diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ddbe82b1de5cf..654d99c013168 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57663,6 +57663,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts)); + // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2) + if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && + InVec.hasOneUse() && TLI.isTypeLegal(VT) && + TLI.isTypeLegal(InVec.getOperand(0).getValueType())) { + unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1); + return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits); + } + // If we are extracting from an insert into a larger vector, replace with a // smaller insert if we don't access less than the original subvector. Don't // do this for i1 vectors. diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index cc22da4aa61d7..4972d3e4ec72b 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0 @@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, 
%xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: movw $1, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} @@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; 
AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 60d5f74c7a364..1b9e9b200a9e3 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -4201,27 +4201,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> ret <4 x double> %res } define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) { -; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> ret <2 x double> %res } define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { -; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3 -; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 +; CHECK-FAST-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 +; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0] +; CHECK-FAST-PERLANE-NEXT: 
vmovapd %xmm1, %xmm0 +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 @@ -4229,15 +4245,24 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v } define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 +; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-FAST-NEXT: vzeroupper +; CHECK-FAST-NEXT: retq +; +; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: +; CHECK-FAST-PERLANE: # %bb.0: +; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 +; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0] +; CHECK-FAST-PERLANE-NEXT: vzeroupper +; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll index f4efacc1946cf..a3b5d8aee03c1 100644 --- a/llvm/test/CodeGen/X86/kshift.ll +++ b/llvm/test/CodeGen/X86/kshift.ll @@ -271,8 +271,7 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} @@ -562,8 +561,7 @@ define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftr_v64i1_63: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index bad0b411f68a9..1b80fcdedb43f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -433,8 +433,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) { define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { ; SLOW-LABEL: test_v16i32_0_1_2_12: ; SLOW: # %bb.0: -; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; SLOW-NEXT: vextractf128 $1, %ymm1, 
%xmm1 +; SLOW-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; SLOW-NEXT: vbroadcastss %xmm1, %xmm1 ; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 2387e05729661..97b262cc7ac5c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2189,13 +2189,21 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) { } define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) { -; ALL-LABEL: test_v8i64_2_5: -; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; ALL-NEXT: vzeroupper -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: test_v8i64_2_5: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: test_v8i64_2_5: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 8cf277aa9796e..ce092f9d343fc 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -2478,8 +2478,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -2631,8 +2630,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 @@ -2775,8 +2773,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512BW-NEXT: movl $286331153, 
%eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -2927,8 +2924,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 @@ -3071,8 +3067,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -3222,8 +3217,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2 @@ -3370,8 +3364,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-NEXT: movw $1, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} {z} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -3508,8 +3501,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: movw $1, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} @@ -5110,21 +5102,36 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; 
AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
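; Hypothetical standalone example (not part of this patch): a minimal IR sketch of the
; pattern the new combineEXTRACT_SUBVECTOR fold targets. Extracting from the top 128-bit
; lane of a 512-bit vector previously went through a 256-bit intermediate extract
; (vextractf64x4 $1 followed by vextractf128 $1); with the
; EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) -> EXTRACT_SUBVECTOR(V,C1+C2) fold the
; lane is reached directly (vextracti/f32x4 $3, or a vpermpd on targets that prefer
; variable cross-lane shuffles, per the updated checks above). The function name is
; illustrative and the exact output depends on the subtarget and surrounding DAG, so
; treat this as a sketch only. It can be fed to llc, e.g.:
;   llc -mtriple=x86_64-- -mattr=+avx512f
define <2 x double> @sketch_upper_lane_extract(<8 x double> %v) {
  ; element 0 paired with element 6 (the first element of the top 128-bit lane)
  %res = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  ret <2 x double> %res
}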