[X86] Fold EXTRACT_SUBVECTOR(ONEUSE(EXTRACT_SUBVECTOR(V,C1))),C2) - EXTRACT_SUBVECTOR(V,C1+C2) #111685
Conversation
…XTRACT_SUBVECTOR(V,C1+C2) Extract from the original source vector whenever possible. This removes a number of dependency bottlenecks and helps a number of shuffle combining cases: either by allowing us to avoid a cross-lane variable shuffle on a slow target by keeping the instruction count below the threshold, or, on fast targets, by making it easier to recognise that the subvectors all came from the same source.
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Extract from the original source vector whenever possible. This removes a number of dependency bottlenecks and helps a number of shuffle combining cases: either by allowing us to avoid a cross-lane variable shuffle on a slow target by keeping the instruction count below the threshold, or, on fast targets, by making it easier to recognise that the subvectors all came from the same source. Patch is 23.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111685.diff 7 Files Affected:
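For illustration only (not part of the patch): the fold relies on simple index arithmetic, namely that extracting a subvector at offset C2 from a subvector that was itself extracted at offset C1 is the same as extracting directly from the original vector at offset C1+C2. The standalone C++ sketch below models this with plain arrays; the extractSubVector helper here is a hypothetical stand-in, not the LLVM function of the same name, and the offsets 8, 4 and 12 mirror the vextracti64x4 $1 + vextracti128 $1 -> vextracti32x4 $3 rewrites visible in the test diffs below (counting 32-bit elements).

#include <array>
#include <cassert>
#include <cstddef>

// Hypothetical helper: copy NumElts consecutive elements starting at Idx.
template <std::size_t NumElts, std::size_t N>
std::array<int, NumElts> extractSubVector(const std::array<int, N> &V,
                                          std::size_t Idx) {
  std::array<int, NumElts> Sub{};
  for (std::size_t I = 0; I != NumElts; ++I)
    Sub[I] = V[Idx + I];
  return Sub;
}

int main() {
  std::array<int, 16> Zmm{}; // stands in for a 512-bit vector of 16 x i32
  for (std::size_t I = 0; I != 16; ++I)
    Zmm[I] = static_cast<int>(I);

  // Two-step extract: ymm1 = upper half of zmm0 (C1 = 8),
  //                   xmm1 = upper half of ymm1 (C2 = 4).
  auto Ymm = extractSubVector<8>(Zmm, 8);
  auto TwoStep = extractSubVector<4>(Ymm, 4);

  // Folded extract: xmm1 = 128-bit lane 3 of zmm0, i.e. offset C1 + C2 = 12.
  auto OneStep = extractSubVector<4>(Zmm, 12);

  assert(TwoStep == OneStep); // both yield elements 12..15
  return 0;
}

The actual combine in the X86ISelLowering.cpp hunk below does the equivalent on SelectionDAG nodes: it reads the inner extract's constant index via InVec.getConstantOperandVal(1), adds the outer index IdxVal, and re-extracts from the inner extract's source operand.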
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ddbe82b1de5cfc..654d99c013168b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57663,6 +57663,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
+ // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
+ if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
+ TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
+ unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
+ return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
+ }
+
// If we are extracting from an insert into a larger vector, replace with a
// smaller insert if we don't access less than the original subvector. Don't
// do this for i1 vectors.
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index cc22da4aa61d76..4972d3e4ec72bb 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: movw $1, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 60d5f74c7a364d..1b9e9b200a9e3c 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4201,27 +4201,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
ret <4 x double> %res
}
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
-; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
ret <2 x double> %res
}
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-FAST-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
+; CHECK-FAST-PERLANE-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
@@ -4229,15 +4245,24 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
}
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index f4efacc1946cff..a3b5d8aee03c10 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -271,8 +271,7 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm0
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
@@ -562,8 +561,7 @@ define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_63:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index bad0b411f68a95..1b80fcdedb43f2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -433,8 +433,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
; SLOW-LABEL: test_v16i32_0_1_2_12:
; SLOW: # %bb.0:
-; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SLOW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 2387e05729661e..97b262cc7ac5c3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2189,13 +2189,21 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
}
define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
-; ALL-LABEL: test_v8i64_2_5:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: ret{{[l|q]}}
+; AVX512F-LABEL: test_v8i64_2_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_v8i64_2_5:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
ret <2 x i64> %res
}
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 8cf277aa9796e8..ce092f9d343fc6 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -2478,8 +2478,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2631,8 +2630,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
@@ -2775,8 +2773,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2927,8 +2924,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
@@ -3071,8 +3067,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3222,8 +3217,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEX...
[truncated]
; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
Is there a regression?
I don't believe so - as I explained in the summary, we've reduced the length of the shuffle chain by an instruction, so it now falls below the threshold that we use for targets with slow cross-lane-variable shuffles. The fast version below is unchanged.
LGTM.
…XTRACT_SUBVECTOR(V,C1+C2) (llvm#111685) Extract from the original source vector whenever possible. This removes a number of dependency bottlenecks and helps a number of shuffle combining cases: either by allowing us to avoid a cross-lane variable shuffle on a slow target by keeping the instruction count below the threshold, or, on fast targets, by making it easier to recognise that the subvectors all came from the same source.