diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index efe5c2464dc00..0f451d7797d45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30057,6 +30057,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   }
 
+  // Build a map of inrange constant amounts with element mask where they occur.
+  SmallDenseMap<unsigned, APInt> UniqueCstAmt;
+  if (ConstantAmt) {
+    for (unsigned I = 0; I != NumElts; ++I) {
+      SDValue A = Amt.getOperand(I);
+      if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
+        continue;
+      unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
+      if (UniqueCstAmt.count(CstAmt)) {
+        UniqueCstAmt[CstAmt].setBit(I);
+        continue;
+      }
+      UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
+    }
+    assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
+  }
+
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -30067,45 +30084,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   //
   // The advantage is that the two shifts from the example would be
   // lowered as X86ISD::VSRLI nodes in parallel before blending.
-  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
-                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
-    SDValue Amt1, Amt2;
-    SmallVector<int, 8> ShuffleMask;
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue A = Amt->getOperand(i);
-      if (A.isUndef()) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      if (!Amt1 || Amt1 == A) {
-        ShuffleMask.push_back(i);
-        Amt1 = A;
-        continue;
-      }
-      if (!Amt2 || Amt2 == A) {
-        ShuffleMask.push_back(i + NumElts);
-        Amt2 = A;
-        continue;
-      }
-      break;
+  if (UniqueCstAmt.size() == 2 &&
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+    unsigned AmtA = UniqueCstAmt.begin()->first;
+    unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
+    const APInt &MaskA = UniqueCstAmt.begin()->second;
+    const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
+    SmallVector<int, 16> ShuffleMask(NumElts, SM_SentinelUndef);
+    for (unsigned I = 0; I != NumElts; ++I) {
+      if (MaskA[I])
+        ShuffleMask[I] = I;
+      if (MaskB[I])
+        ShuffleMask[I] = I + NumElts;
     }
 
     // Only perform this blend if we can perform it without loading a mask.
-    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
-        (VT != MVT::v16i16 ||
+    if ((VT != MVT::v16i16 ||
          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
          canWidenShuffleElements(ShuffleMask))) {
-      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
-      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
-      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
-          Cst2->getAPIntValue().ult(EltSizeInBits)) {
-        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst1->getZExtValue(), DAG);
-        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst2->getZExtValue(), DAG);
-        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
-      }
+      SDValue Shift1 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtA, DAG);
+      SDValue Shift2 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtB, DAG);
+      return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
     }
   }
 
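Note (not part of the patch): the mapping logic above is easiest to see outside of SelectionDAG. Below is a minimal standalone C++ sketch of the same idea, using std::map and uint64_t lane bitmasks in place of SmallDenseMap/APInt; the function name twoShiftBlendMask and its signature are hypothetical, purely for illustration.

```cpp
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

// Hypothetical stand-in for the patch's logic: map each unique in-range
// constant shift amount to a bitmask of the lanes that use it, then, if
// exactly two amounts occur, build a two-input blend (shuffle) mask.
constexpr int SentinelUndef = -1;

std::optional<std::vector<int>>
twoShiftBlendMask(const std::vector<std::optional<unsigned>> &Amts,
                  unsigned EltSizeInBits, unsigned &AmtA, unsigned &AmtB) {
  unsigned NumElts = Amts.size();
  std::map<unsigned, uint64_t> UniqueCstAmt; // amount -> lane bitmask
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!Amts[I] || *Amts[I] >= EltSizeInBits) // skip undef / out-of-range
      continue;
    UniqueCstAmt[*Amts[I]] |= uint64_t(1) << I;
  }
  if (UniqueCstAmt.size() != 2)
    return std::nullopt; // this lowering handles exactly two amounts
  auto It = UniqueCstAmt.begin();
  AmtA = It->first;
  uint64_t MaskA = It->second;
  ++It;
  AmtB = It->first;
  uint64_t MaskB = It->second;
  // Lanes using AmtA pick from the first shifted vector (indices 0..N-1),
  // lanes using AmtB from the second (indices N..2N-1); undef lanes stay -1.
  std::vector<int> ShuffleMask(NumElts, SentinelUndef);
  for (unsigned I = 0; I != NumElts; ++I) {
    if (MaskA & (uint64_t(1) << I))
      ShuffleMask[I] = int(I);
    if (MaskB & (uint64_t(1) << I))
      ShuffleMask[I] = int(I + NumElts);
  }
  return ShuffleMask;
}
```

With the amounts from test2 below ({0, undef, 0, 0, 1, undef, undef, 1}), this yields AmtA = 0, AmtB = 1 and the mask {0, -1, 2, 3, 12, -1, -1, 15}: lanes that shift by 0 come from the first shift result, lanes that shift by 1 from the second.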
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 59bc3940fcb31..48ed39e5da88f 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -22,15 +22,27 @@ define <8 x i16> @test1(<8 x i16> %a) {
   ret <8 x i16> %shl
 }
 
+; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())
+
 define <8 x i16> @test2(<8 x i16> %a) {
-; SSE-LABEL: test2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,1,1,2,u,u,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: test2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test2:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllw $1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,1,1,2,u,u,2]
+; AVX-NEXT:    vpsllw $1, %xmm0, %xmm1
+; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    retq
   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 undef, i16 1>
   ret <8 x i16> %shl
@@ -43,17 +55,18 @@ define <8 x i16> @test2(<8 x i16> %a) {
 define <4 x i32> @test3(<4 x i32> %a) {
 ; SSE2-LABEL: test3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld $1, %xmm1
+; SSE2-NEXT:    pslld $2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $2, %xmm1
+; SSE41-NEXT:    pslld $1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
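As a sanity check on the new test2 expectations (again illustrative, not part of the patch): shifting the low four i16 lanes by 0 and the high four by 1 is the same as blending the unshifted vector with a whole-vector psllw by 1, which is what the shufps/pblendw forms above encode. A scalar model, with the undef lanes resolved to the amount of their half:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint16_t X[8] = {1, 2, 3, 4, 5, 6, 7, 0x8001};
  unsigned Amt[8] = {0, 0, 0, 0, 1, 1, 1, 1}; // undef lanes picked as 0 / 1

  uint16_t PerLane[8], Blend[8];
  for (int I = 0; I != 8; ++I) {
    // Reference: independent per-lane shift.
    PerLane[I] = uint16_t(X[I] << Amt[I]);
    // shuffle(psllw $0, psllw $1): low lanes from X, high lanes from X << 1.
    Blend[I] = I < 4 ? X[I] : uint16_t(X[I] << 1);
  }
  for (int I = 0; I != 8; ++I)
    assert(PerLane[I] == Blend[I]);
  return 0;
}
```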
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index d8e45ed9151d8..eb4d84b8d7dd6 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -337,7 +337,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $27, %xmm2
 ; SSE41-NEXT:    psrld $28, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    retq
@@ -346,7 +346,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrld $27, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
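For the funnel-shift tests, the shift constants fall directly out of the fshl semantics; a scalar model (illustrative, not LLVM code) makes the psrld $28 / psrld $27 on %y above plain. The pblendw mask change itself is benign: only the low two i32 lanes of these widened v2i32 ops are defined, so the upper lanes may come from either source.

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of llvm.fshl.i32: concat(x, y) shifted left by c (mod 32),
// keeping the high word. For the constant amounts <4, 5> used in
// constant_funnnel_v2i32, the y contribution is y >> 28 and y >> 27, which
// is exactly where the psrld $28 / psrld $27 in the checks come from.
uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t C) {
  C &= 31;
  return C ? (X << C) | (Y >> (32 - C)) : X;
}

int main() {
  uint32_t Y = 0xDEADBEEF;
  assert(fshl32(0, Y, 4) == Y >> 28);
  assert(fshl32(0, Y, 5) == Y >> 27);
  return 0;
}
```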
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index a6067a960fc0d..58dc17988b646 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -379,16 +379,11 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-NEXT:    psrld $4, %xmm3
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; SSE2-NEXT:    movl $268435456, %eax # imm = 0x10000000
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pslld $28, %xmm0
+; SSE2-NEXT:    pslld $27, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_funnnel_v2i32:
@@ -400,7 +395,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT:    psrld $4, %xmm3
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $27, %xmm1
+; SSE41-NEXT:    pslld $28, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -411,7 +409,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $27, %xmm0, %xmm2
+; AVX1-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -482,22 +482,17 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v2i32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrld $5, %xmm3
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    psrld $4, %xmm2
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
-; X86-SSE2-NEXT:    movl $268435456, %eax # imm = 0x10000000
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $5, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    psrld $4, %xmm3
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pslld $28, %xmm0
+; X86-SSE2-NEXT:    pslld $27, %xmm1
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
   ret <2 x i32> %res
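The fshr case mirrors it: with amounts <4, 5>, %x contributes x << 28 and x << 27, so the pmuludq/pmulld-by-constant sequences collapse to two pslld ops plus an unpack (SSE2) or blend (SSE41/AVX1). A matching scalar model, again purely illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of llvm.fshr.i32 for the <4, 5> amounts in
// constant_funnnel_v2i32: x contributes x << 28 / x << 27 (the new
// pslld $28 / pslld $27), y contributes y >> 4 / y >> 5 (psrld $4 / $5).
uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t C) {
  C &= 31;
  return C ? (X << (32 - C)) | (Y >> C) : Y;
}

int main() {
  uint32_t X = 0x12345678, Y = 0x9ABCDEF0;
  assert(fshr32(X, Y, 4) == ((X << 28) | (Y >> 4)));
  assert(fshr32(X, Y, 5) == ((X << 27) | (Y >> 5)));
  return 0;
}
```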