Skip to content

Commit 7a9f53c

Browse files
committed
[X86] combineBROADCAST_LOAD - merge across chains (REAPPLIED) (#128209)
Remove the restriction that, when reusing a wider BROADCAST_LOAD node, neither node could have any uses of its load chain - use makeEquivalentMemoryOrdering to merge the chains instead. Reapplied - moved the makeEquivalentMemoryOrdering call before the CombineTo call to ensure that the original node hasn't already been removed. Fixes the asan use-after-poison error reported in #128380 / 50b0669.
1 parent 301fe47 commit 7a9f53c

File tree

3 files changed

+128
-155
lines changed

3 files changed

+128
-155
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59357,21 +59357,14 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
5935759357
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
5935859358
}
5935959359

59360-
// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
59361-
// from. Limit this to cases where the loads have the same input chain and the
59362-
// output chains are unused. This avoids any memory ordering issues.
59360+
// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD to extract from.
5936359361
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
5936459362
TargetLowering::DAGCombinerInfo &DCI) {
5936559363
assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
5936659364
N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
5936759365
"Unknown broadcast load type");
5936859366

59369-
// Only do this if the chain result is unused.
59370-
if (N->hasAnyUseOfValue(1))
59371-
return SDValue();
59372-
5937359367
auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
59374-
5937559368
SDValue Ptr = MemIntrin->getBasePtr();
5937659369
SDValue Chain = MemIntrin->getChain();
5937759370
EVT VT = N->getSimpleValueType(0);
@@ -59385,12 +59378,15 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
5938559378
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
5938659379
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
5938759380
MemVT.getSizeInBits() &&
59388-
!User->hasAnyUseOfValue(1) &&
5938959381
User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
59382+
assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
59383+
MemIntrin->isSimple() && "Illegal broadcast load type");
59384+
DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), SDValue(User, 1));
5939059385
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
5939159386
VT.getSizeInBits());
5939259387
Extract = DAG.getBitcast(VT, Extract);
59393-
return DCI.CombineTo(N, Extract, SDValue(User, 1));
59388+
Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
59389+
return Extract;
5939459390
}
5939559391

5939659392
return SDValue();

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 88 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,15 +1888,14 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
18881888
;
18891889
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
18901890
; AVX2: # %bb.0:
1891-
; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1892-
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
1893-
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1894-
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
1895-
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
1896-
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
1897-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1898-
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
1899-
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
1891+
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
1892+
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1893+
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
1894+
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1895+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1896+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1897+
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
1898+
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
19001899
; AVX2-NEXT: vzeroupper
19011900
; AVX2-NEXT: retq
19021901
;
@@ -2112,15 +2111,14 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
21122111
;
21132112
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
21142113
; AVX2: # %bb.0:
2115-
; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2116-
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2117-
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2118-
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2119-
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2120-
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2121-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2122-
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2123-
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2114+
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2115+
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2116+
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2117+
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2118+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2119+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2120+
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2121+
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
21242122
; AVX2-NEXT: vzeroupper
21252123
; AVX2-NEXT: retq
21262124
;
@@ -2237,33 +2235,29 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
22372235
;
22382236
; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
22392237
; AVX512F: # %bb.0:
2240-
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2241-
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2242-
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2243-
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2244-
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2245-
; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2246-
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2247-
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2248-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2249-
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2250-
; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2238+
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2239+
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2240+
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2241+
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2242+
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2243+
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2244+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2245+
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2246+
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
22512247
; AVX512F-NEXT: vzeroupper
22522248
; AVX512F-NEXT: retq
22532249
;
22542250
; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
22552251
; AVX512DQ: # %bb.0:
2256-
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2257-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2258-
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2259-
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2260-
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2261-
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2262-
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2263-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2264-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2265-
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2266-
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2252+
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2253+
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2254+
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2255+
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2256+
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2257+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2258+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2259+
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2260+
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
22672261
; AVX512DQ-NEXT: vzeroupper
22682262
; AVX512DQ-NEXT: retq
22692263
;
@@ -2272,9 +2266,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
22722266
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
22732267
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
22742268
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2275-
; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2276-
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
22772269
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2270+
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
22782271
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
22792272
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
22802273
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2339,15 +2332,14 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
23392332
;
23402333
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
23412334
; AVX2: # %bb.0:
2342-
; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2343-
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2344-
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2345-
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2346-
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2347-
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2348-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2349-
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2350-
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2335+
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2336+
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2337+
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2338+
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2339+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2340+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2341+
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2342+
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
23512343
; AVX2-NEXT: vzeroupper
23522344
; AVX2-NEXT: retq
23532345
;
@@ -2462,33 +2454,29 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
24622454
;
24632455
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
24642456
; AVX512F: # %bb.0:
2465-
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2466-
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2467-
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2468-
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2469-
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2470-
; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2471-
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2472-
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2473-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2474-
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2475-
; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2457+
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2458+
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2459+
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2460+
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2461+
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2462+
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2463+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2464+
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2465+
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
24762466
; AVX512F-NEXT: vzeroupper
24772467
; AVX512F-NEXT: retq
24782468
;
24792469
; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
24802470
; AVX512DQ: # %bb.0:
2481-
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2482-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2483-
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2484-
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2485-
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2486-
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2487-
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2488-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2489-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2490-
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2491-
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2471+
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2472+
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2473+
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2474+
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2475+
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2476+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2477+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2478+
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2479+
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
24922480
; AVX512DQ-NEXT: vzeroupper
24932481
; AVX512DQ-NEXT: retq
24942482
;
@@ -2497,9 +2485,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
24972485
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
24982486
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
24992487
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2500-
; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2501-
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
25022488
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2489+
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
25032490
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
25042491
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
25052492
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2788,14 +2775,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
27882775
;
27892776
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
27902777
; AVX2: # %bb.0:
2791-
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2792-
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2793-
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2794-
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2795-
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2796-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2797-
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2798-
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2778+
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2779+
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2780+
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2781+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2782+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2783+
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2784+
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
27992785
; AVX2-NEXT: vzeroupper
28002786
; AVX2-NEXT: retq
28012787
;
@@ -2990,14 +2976,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
29902976
;
29912977
; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
29922978
; AVX2: # %bb.0:
2993-
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2994-
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2995-
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2996-
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
2997-
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2998-
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2999-
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3000-
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2979+
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2980+
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2981+
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
2982+
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2983+
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2984+
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2985+
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
30012986
; AVX2-NEXT: vzeroupper
30022987
; AVX2-NEXT: retq
30032988
;
@@ -3108,27 +3093,25 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
31083093
;
31093094
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31103095
; AVX512F: # %bb.0:
3111-
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
3096+
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
31123097
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3113-
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3114-
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3115-
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3116-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3117-
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
3118-
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
3098+
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3099+
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3100+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3101+
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
3102+
; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
31193103
; AVX512F-NEXT: vzeroupper
31203104
; AVX512F-NEXT: retq
31213105
;
31223106
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
31233107
; AVX512DQ: # %bb.0:
3124-
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
3108+
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
31253109
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3126-
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3127-
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3128-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3129-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3130-
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
3131-
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
3110+
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3111+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3112+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3113+
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
3114+
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
31323115
; AVX512DQ-NEXT: vzeroupper
31333116
; AVX512DQ-NEXT: retq
31343117
;

0 commit comments

Comments
 (0)