diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2571873dba848..2479bc3fd8f08 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58706,11 +58706,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
-
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
     return DAG.getConstant(0, SDLoc(N), VT);
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Fold kshiftr(extract_subvector(X,C1),C2)
+  //  --> extract_subvector(kshiftr(X,C1+C2),0)
+  // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
+  if (N->getOpcode() == X86ISD::KSHIFTR) {
+    SDLoc DL(N);
+    if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+        N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
+      SDValue Src = N->getOperand(0).getOperand(0);
+      uint64_t Amt = N->getConstantOperandVal(1) +
+                     N->getOperand(0).getConstantOperandVal(1);
+      EVT SrcVT = Src.getValueType();
+      if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
+        SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
+                                    DAG.getTargetConstant(Amt, DL, MVT::i8));
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
+                           DAG.getIntPtrConstant(0, DL));
+      }
+    }
+  }
+
   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
     return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index 537f42dd9c2c5..e0f3b6c4ec90a 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
 ; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
 ; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
 ; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512BW-NEXT: retq
 %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
 ; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
 ; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
 ; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512BW-NEXT: retq
 %res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index bd52b9cd41584..f6e5986afac53 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
 ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT: vpmovb2m %ymm0, %k1
 ; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kshiftrd $8, %k1, %k2
 ; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; SKX-NEXT: kshiftrd $16, %k1, %k1
-; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: kshiftrd $16, %k1, %k2
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; SKX-NEXT: kshiftrd $24, %k1, %k1
 ; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; SKX-NEXT: retq
 %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 83d3a33572266..c879cb9867ab2 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
 ; KNL-NEXT: fldz
 ; KNL-NEXT: fld %st(0)
 ; KNL-NEXT: fcmovne %st(2), %st
-; KNL-NEXT: testb $2, %al
-; KNL-NEXT: fld %st(1)
-; KNL-NEXT: fcmovne %st(3), %st
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(1)
+; KNL-NEXT: fcmovne %st(3), %st
+; KNL-NEXT: testb $2, %al
 ; KNL-NEXT: fld %st(2)
 ; KNL-NEXT: fcmovne %st(4), %st
-; KNL-NEXT: testb $2, %al
+; KNL-NEXT: testb $8, %al
 ; KNL-NEXT: fxch %st(3)
 ; KNL-NEXT: fcmovne %st(4), %st
 ; KNL-NEXT: fstp %st(4)
 ; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
 ; KNL-NEXT: fstpt 10(%rdi)
 ; KNL-NEXT: fxch %st(1)
 ; KNL-NEXT: fstpt (%rdi)
-; KNL-NEXT: fxch %st(1)
-; KNL-NEXT: fstpt 30(%rdi)
 ; KNL-NEXT: fstpt 20(%rdi)
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SKX-NEXT: fldz
 ; SKX-NEXT: fld %st(0)
 ; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: testb $2, %al
-; SKX-NEXT: fld %st(1)
-; SKX-NEXT: fcmovne %st(3), %st
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(1)
+; SKX-NEXT: fcmovne %st(3), %st
+; SKX-NEXT: testb $2, %al
 ; SKX-NEXT: fld %st(2)
 ; SKX-NEXT: fcmovne %st(4), %st
-; SKX-NEXT: testb $2, %al
+; SKX-NEXT: testb $8, %al
 ; SKX-NEXT: fxch %st(3)
 ; SKX-NEXT: fcmovne %st(4), %st
 ; SKX-NEXT: fstp %st(4)
 ; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fxch %st(1)
 ; SKX-NEXT: fstpt 10(%rdi)
 ; SKX-NEXT: fxch %st(1)
 ; SKX-NEXT: fstpt (%rdi)
-; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 30(%rdi)
 ; SKX-NEXT: fstpt 20(%rdi)
 ; SKX-NEXT: retq
 bb:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a71..5b2431eb21495 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
 ; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $8, %al
 ; AVX512VL-NEXT: fld1
 ; AVX512VL-NEXT: fldz
 ; AVX512VL-NEXT: fld %st(0)
 ; AVX512VL-NEXT: fcmovne %st(2), %st
-; AVX512VL-NEXT: testb $1, %al
+; AVX512VL-NEXT: testb $2, %al
 ; AVX512VL-NEXT: fld %st(1)
 ; AVX512VL-NEXT: fcmovne %st(3), %st
-; AVX512VL-NEXT: kmovd %k1, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $1, %al
 ; AVX512VL-NEXT: fld %st(2)
 ; AVX512VL-NEXT: fcmovne %st(4), %st
+; AVX512VL-NEXT: kmovd %k1, %eax
 ; AVX512VL-NEXT: testb $1, %al
 ; AVX512VL-NEXT: fxch %st(3)
 ; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-NEXT: fstpt 10(%rdi)
 ; AVX512VL-NEXT: fxch %st(1)
 ; AVX512VL-NEXT: fadd %st, %st(0)
+; AVX512VL-NEXT: fstpt 60(%rdi)
+; AVX512VL-NEXT: fadd %st, %st(0)
 ; AVX512VL-NEXT: fstpt 20(%rdi)
 ; AVX512VL-NEXT: fadd %st, %st(0)
 ; AVX512VL-NEXT: fstpt (%rdi)
 ; AVX512VL-NEXT: fadd %st, %st(0)
-; AVX512VL-NEXT: fstpt 60(%rdi)
-; AVX512VL-NEXT: fadd %st, %st(0)
 ; AVX512VL-NEXT: fstpt 40(%rdi)
 %1 = icmp eq <4 x i64> , %a
 %2 = select <4 x i1> %1, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 22b5246443fa8..7e081310c35be 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
 ; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 4d7d2573183e0..68c6ca93576b7 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
 ; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index f8c076db65de9..17b98b5ebcaea 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
 ; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
-; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
+; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
 ; AVX512VL-NEXT: kxorw %k0, %k1, %k0
 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
 ; AVX512VL-NEXT: kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
 ; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
 ; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
+; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
 ; AVX512VL-NEXT: kxorw %k0, %k4, %k0
 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
 ; AVX512VL-NEXT: kxorw %k4, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 358b2a503df26..a8df418143f32 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2788,19 +2788,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -3000,33 +3000,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k4, %k5
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k3, %k4
 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k3, %k4
 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -3063,33 +3063,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k4, %k5
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k3, %k4
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k3, %k4
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -3309,14 +3309,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
 ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx)
 ; AVX512BW-ONLY-NEXT: vzeroupper
 ; AVX512BW-ONLY-NEXT: retq
 ;
@@ -3330,14 +3330,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
 ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vzeroupper
 ; AVX512VBMI-ONLY-NEXT: retq
 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
@@ -9338,12 +9338,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
 ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
-; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
+; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
@@ -9362,12 +9362,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
 ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
@@ -12938,12 +12938,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -12959,12 +12959,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -13088,19 +13088,19 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -13299,33 +13299,33 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3
 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4
-; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT: kshiftrq $16, %k4, %k5
 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
-; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT: kshiftrq $48, %k4, %k5
 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT: kshiftrq $16, %k3, %k4
 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
-; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT: kshiftrq $48, %k3, %k4
 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k3
 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k3
 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -13682,8 +13682,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
-; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
@@ -13691,73 +13691,73 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z}
 ; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
 ; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm15, %k2
-; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k2} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm16, %k2
+; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm16 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm16, %k1
-; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm15, %k1
+; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
 ; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
 ; AVX512BW-NEXT: vpmovb2m %zmm7, %k1
 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
-; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm29 {%k2} {z}
+; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm30 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
 ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm29, 1920(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm30, 1792(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
@@ -13769,11 +13769,11 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm15, 1024(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
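
The comment in the new combineKSHIFT code describes the fold kshiftr(extract_subvector(X,C1),C2) --> extract_subvector(kshiftr(X,C1+C2),0), and likewise kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2). A minimal standalone sketch of why the shift amounts simply add, modelling a 64-bit k-register as a plain uint64_t (the mask value and the helper program below are illustrative only, not part of the patch): extracting the subvector at element C1, or doing a first kshiftr by C1, and then shifting right by C2 reads the same bits as one shift of the wide mask by C1+C2 followed by truncation to the narrow type. That is why the updated checks use a single kshiftrq $48 where the old output had kshiftrq $32 followed by kshiftrd $16.

```cpp
// Standalone illustration (not part of the patch): model a 64-bit k-mask as a
// uint64_t and compare the two shift orderings the combine treats as equal.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Mask = 0xDEADBEEFCAFEF00Dull; // arbitrary example mask value
  unsigned C1 = 32, C2 = 16;             // upper-half extract, then shift by 16

  // kshiftr(extract_subvector(X, C1), C2): narrow to 32 bits first, then shift.
  uint32_t Narrow = (uint32_t)(Mask >> C1) >> C2;

  // extract_subvector(kshiftr(X, C1 + C2), 0): one wide shift, then truncate.
  uint32_t Wide = (uint32_t)(Mask >> (C1 + C2));

  assert(Narrow == Wide); // e.g. kshiftrq $32 + kshiftrd $16 == kshiftrq $48
  return 0;
}
```

The `Amt < SrcVT.getVectorNumElements()` guard in the patch keeps the combined amount within the wide mask width, which is the same bounds assumption this sketch relies on.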