Skip to content

Commit 3e4ee76

Browse files
authored
[X86] Fold EXTRACT_SUBVECTOR(ONEUSE(EXTRACT_SUBVECTOR(V,C1)),C2) -> EXTRACT_SUBVECTOR(V,C1+C2) (#111685)
Extract from the original source vector whenever possible. This removes a number of dependency bottlenecks and helps a number of shuffle combining cases: either by allowing us to avoid a cross-lane variable shuffle on a slow target by keeping the instruction count below the threshold, or, on fast targets, by making it easier to recognise that the subvectors all came from the same source.
1 parent 993de55 commit 3e4ee76

7 files changed

+147
-97
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57583,6 +57583,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5758357583
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
5758457584
return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
5758557585

57586+
// EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) -> EXTRACT_SUBVECTOR(V,C1+C2)
57587+
if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57588+
InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
57589+
TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
57590+
unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
57591+
return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
57592+
}
57593+
5758657594
// If we are extracting from an insert into a larger vector, replace with a
5758757595
// smaller insert if we don't access less than the original subvector. Don't
5758857596
// do this for i1 vectors.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2470,8 +2470,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
24702470
; AVX512BW: # %bb.0:
24712471
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
24722472
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2473-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2474-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2473+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
24752474
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
24762475
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
24772476
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -2609,8 +2608,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
26092608
; AVX512BW: # %bb.0:
26102609
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
26112610
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2612-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2613-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2611+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
26142612
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
26152613
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
26162614
; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -2740,8 +2738,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27402738
; AVX512BW: # %bb.0:
27412739
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
27422740
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2743-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2744-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2741+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
27452742
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
27462743
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
27472744
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -2879,8 +2876,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
28792876
; AVX512BW: # %bb.0:
28802877
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
28812878
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2882-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2883-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2879+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
28842880
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
28852881
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
28862882
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3010,8 +3006,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
30103006
; AVX512BW: # %bb.0:
30113007
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
30123008
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3013-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3014-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3009+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
30153010
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
30163011
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
30173012
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3148,8 +3143,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
31483143
; AVX512BW: # %bb.0:
31493144
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
31503145
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3151-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3152-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3146+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
31533147
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
31543148
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
31553149
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
@@ -3290,8 +3284,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32903284
; AVX512BW: # %bb.0:
32913285
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
32923286
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3293-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3294-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3287+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
32953288
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
32963289
; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
32973290
; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3407,8 +3400,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
34073400
; AVX512BW: # %bb.0:
34083401
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
34093402
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3410-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3411-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3403+
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
34123404
; AVX512BW-NEXT: movw $1, %ax
34133405
; AVX512BW-NEXT: kmovd %eax, %k1
34143406
; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -4565,17 +4557,30 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
45654557
; AVX512DQ-NEXT: vzeroupper
45664558
; AVX512DQ-NEXT: retq
45674559
;
4568-
; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4569-
; AVX512BW: # %bb.0:
4570-
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4571-
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4572-
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4573-
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4574-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4575-
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4576-
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4577-
; AVX512BW-NEXT: vzeroupper
4578-
; AVX512BW-NEXT: retq
4560+
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4561+
; AVX512BW-SLOW: # %bb.0:
4562+
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4563+
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4564+
; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
4565+
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4566+
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4567+
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4568+
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4569+
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4570+
; AVX512BW-SLOW-NEXT: vzeroupper
4571+
; AVX512BW-SLOW-NEXT: retq
4572+
;
4573+
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4574+
; AVX512BW-FAST: # %bb.0:
4575+
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4576+
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4577+
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4578+
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
4579+
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4580+
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4581+
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4582+
; AVX512BW-FAST-NEXT: vzeroupper
4583+
; AVX512BW-FAST-NEXT: retq
45794584
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
45804585
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
45814586
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 51 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4201,43 +4201,68 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
42014201
ret <4 x double> %res
42024202
}
42034203
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
4204-
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4205-
; CHECK: # %bb.0:
4206-
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4207-
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
4208-
; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4209-
; CHECK-NEXT: vzeroupper
4210-
; CHECK-NEXT: retq
4204+
; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4205+
; CHECK-FAST: # %bb.0:
4206+
; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
4207+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
4208+
; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4209+
; CHECK-FAST-NEXT: vzeroupper
4210+
; CHECK-FAST-NEXT: retq
4211+
;
4212+
; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4213+
; CHECK-FAST-PERLANE: # %bb.0:
4214+
; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1
4215+
; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4216+
; CHECK-FAST-PERLANE-NEXT: vzeroupper
4217+
; CHECK-FAST-PERLANE-NEXT: retq
42114218
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
42124219
ret <2 x double> %res
42134220
}
42144221
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4215-
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4216-
; CHECK: # %bb.0:
4217-
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4218-
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
4219-
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4220-
; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
4221-
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4222-
; CHECK-NEXT: vmovapd %xmm1, %xmm0
4223-
; CHECK-NEXT: vzeroupper
4224-
; CHECK-NEXT: retq
4222+
; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4223+
; CHECK-FAST: # %bb.0:
4224+
; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
4225+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4226+
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4227+
; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4228+
; CHECK-FAST-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4229+
; CHECK-FAST-NEXT: vzeroupper
4230+
; CHECK-FAST-NEXT: retq
4231+
;
4232+
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4233+
; CHECK-FAST-PERLANE: # %bb.0:
4234+
; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3
4235+
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4236+
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
4237+
; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4238+
; CHECK-FAST-PERLANE-NEXT: vmovapd %xmm1, %xmm0
4239+
; CHECK-FAST-PERLANE-NEXT: vzeroupper
4240+
; CHECK-FAST-PERLANE-NEXT: retq
42254241
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
42264242
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
42274243
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
42284244
ret <2 x double> %res
42294245
}
42304246

42314247
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4232-
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4233-
; CHECK: # %bb.0:
4234-
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4235-
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
4236-
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4237-
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4238-
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4239-
; CHECK-NEXT: vzeroupper
4240-
; CHECK-NEXT: retq
4248+
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4249+
; CHECK-FAST: # %bb.0:
4250+
; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
4251+
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4252+
; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4253+
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4254+
; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4255+
; CHECK-FAST-NEXT: vzeroupper
4256+
; CHECK-FAST-NEXT: retq
4257+
;
4258+
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4259+
; CHECK-FAST-PERLANE: # %bb.0:
4260+
; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2
4261+
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4262+
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4263+
; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4264+
; CHECK-FAST-PERLANE-NEXT: vzeroupper
4265+
; CHECK-FAST-PERLANE-NEXT: retq
42414266
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
42424267
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
42434268
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer

llvm/test/CodeGen/X86/kshift.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,8 +271,7 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
271271
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
272272
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
273273
; KNL-NEXT: kshiftlw $15, %k0, %k1
274-
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
275-
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
274+
; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm0
276275
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
277276
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
278277
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
@@ -562,8 +561,7 @@ define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
562561
define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
563562
; KNL-LABEL: kshiftr_v64i1_63:
564563
; KNL: # %bb.0:
565-
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
566-
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
564+
; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
567565
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
568566
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
569567
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0

llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,8 +433,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
433433
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
434434
; SLOW-LABEL: test_v16i32_0_1_2_12:
435435
; SLOW: # %bb.0:
436-
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
437-
; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
436+
; SLOW-NEXT: vextractf32x4 $3, %zmm0, %xmm1
438437
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
439438
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
440439
; SLOW-NEXT: vzeroupper

llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2189,13 +2189,21 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
21892189
}
21902190

21912191
define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
2192-
; ALL-LABEL: test_v8i64_2_5:
2193-
; ALL: # %bb.0:
2194-
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
2195-
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2196-
; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2197-
; ALL-NEXT: vzeroupper
2198-
; ALL-NEXT: ret{{[l|q]}}
2192+
; AVX512F-LABEL: test_v8i64_2_5:
2193+
; AVX512F: # %bb.0:
2194+
; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5]
2195+
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2196+
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2197+
; AVX512F-NEXT: vzeroupper
2198+
; AVX512F-NEXT: retq
2199+
;
2200+
; AVX512F-32-LABEL: test_v8i64_2_5:
2201+
; AVX512F-32: # %bb.0:
2202+
; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0]
2203+
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2204+
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2205+
; AVX512F-32-NEXT: vzeroupper
2206+
; AVX512F-32-NEXT: retl
21992207
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
22002208
ret <2 x i64> %res
22012209
}

0 commit comments

Comments
 (0)