diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 5d27accdc198c..b7a20c351f5ff 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -309,3 +309,34 @@ void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum, const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol(); O << Sym.getName(); } + +void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum, + raw_ostream &O, const char *Modifier) { + const MCOperand &MO = MI->getOperand(OpNum); + int64_t Imm = MO.getImm(); + + switch (Imm) { + default: + return; + case NVPTX::PTXPrmtMode::NONE: + break; + case NVPTX::PTXPrmtMode::F4E: + O << ".f4e"; + break; + case NVPTX::PTXPrmtMode::B4E: + O << ".b4e"; + break; + case NVPTX::PTXPrmtMode::RC8: + O << ".rc8"; + break; + case NVPTX::PTXPrmtMode::ECL: + O << ".ecl"; + break; + case NVPTX::PTXPrmtMode::ECR: + O << ".ecr"; + break; + case NVPTX::PTXPrmtMode::RC16: + O << ".rc16"; + break; + } +} diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 49ad3f269229d..e6954f861cd10 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -47,6 +47,8 @@ class NVPTXInstPrinter : public MCInstPrinter { raw_ostream &O, const char *Modifier = nullptr); void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O, const char *Modifier = nullptr); + void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O, + const char *Modifier = nullptr); }; } diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 8dc68911fff0c..07ee34968b023 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -181,6 +181,18 @@ enum CmpMode { FTZ_FLAG = 0x100 }; } + +namespace PTXPrmtMode { +enum PrmtMode { + NONE, + F4E, + B4E, + RC8, + ECL, + ECR, + RC16, +}; +} } void initializeNVPTXDAGToDAGISelPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0aef2591c6e23..68391cdb6ff17 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTXUtilities.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -829,6 +830,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, case MVT::v2f16: case MVT::v2bf16: case MVT::v2i16: + case MVT::v4i8: return Opcode_i32; case MVT::f32: return Opcode_f32; @@ -910,7 +912,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { // Vector Setting unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; if (SimpleVT.isVector()) { - assert(Isv2x16VT(LoadedVT) && "Unexpected vector type"); + assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && + "Unexpected vector type"); // v2f16/v2bf16/v2i16 is loaded using ld.b32 fromTypeWidth = 32; } @@ -1254,6 +1257,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { SDLoc DL(N); SDNode *LD; SDValue Base, Offset, Addr; + EVT OrigType = N->getValueType(0); EVT EltVT = Mem->getMemoryVT(); unsigned NumElts = 1; if (EltVT.isVector()) { NumElts = EltVT.getVectorNumElements(); EltVT = EltVT.getVectorElementType(); //
vectors of 16bits type are loaded/stored as multiples of v2x16 elements. - if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) || - (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) || - (EltVT == MVT::i16 && N->getValueType(0) == MVT::v2i16)) { + if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) || + (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) || + (EltVT == MVT::i16 && OrigType == MVT::v2i16)) { assert(NumElts % 2 == 0 && "Vector must have even number of elements"); - EltVT = N->getValueType(0); + EltVT = OrigType; NumElts /= 2; + } else if (OrigType == MVT::v4i8) { + EltVT = OrigType; + NumElts = 1; } } @@ -1601,7 +1608,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { // concept of sign-/zero-extension, so emulate it here by adding an explicit // CVT instruction. Ptxas should clean up any redundancies here. - EVT OrigType = N->getValueType(0); LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N); if (OrigType != EltVT && @@ -1679,7 +1685,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT ScalarVT = SimpleVT.getScalarType(); unsigned toTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { - assert(Isv2x16VT(StoreVT) && "Unexpected vector type"); + assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && + "Unexpected vector type"); // v2x16 is stored using st.b32 toTypeWidth = 32; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b24aae4792ce6..36da2e7b40efa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -221,6 +221,11 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, llvm_unreachable("Unexpected type"); } NumElts /= 2; + } else if (EltVT.getSimpleVT() == MVT::i8 && + (NumElts % 4 == 0 || NumElts == 3)) { + // v*i8 are formally lowered as v4i8 + EltVT = MVT::v4i8; + NumElts = (NumElts + 3) / 4; } for (unsigned j = 0; j != NumElts; ++j) { ValueVTs.push_back(EltVT); @@ -458,6 +463,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass); + addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); @@ -491,10 +497,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); + // Only logical ops can be done on v4i8 directly; others must be done + // elementwise. + setOperationAction( + {ISD::ADD, ISD::MUL, ISD::ABS, ISD::SMIN, + ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::CTPOP, + ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL, + ISD::SHL, ISD::SREM, ISD::UREM, ISD::SDIV, + ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, + ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, + ISD::UINT_TO_FP}, MVT::v4i8, Expand); +
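An aside on the ComputePTXValueVTs hunk above: i8 vectors whose element count is a multiple of 4 (or exactly 3) are re-chunked into v4i8 parts, each of which travels in a single .b32 register. A minimal standalone sketch of that arithmetic (plain C++, not the LLVM API; the function name is illustrative only):

    #include <cassert>
    #include <cstdio>

    // Mirrors (NumElts % 4 == 0 || NumElts == 3) -> (NumElts + 3) / 4 above:
    // v3i8 rounds up to one padded v4i8 part, v8i8 becomes two v4i8 parts.
    static unsigned numV4I8Parts(unsigned NumElts) {
      assert(NumElts % 4 == 0 || NumElts == 3);
      return (NumElts + 3) / 4;
    }

    int main() {
      std::printf("v3i8 -> %u x v4i8\n", numV4I8Parts(3)); // 1 (one pad byte)
      std::printf("v4i8 -> %u x v4i8\n", numV4I8Parts(4)); // 1
      std::printf("v8i8 -> %u x v4i8\n", numV4I8Parts(8)); // 2
      return 0;
    }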
// Operations not directly supported by NVPTX. - for (MVT VT : - {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64, - MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}) { + for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, + MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, + MVT::i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } @@ -672,7 +694,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // We have some custom DAG combine patterns for these nodes setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL, - ISD::SREM, ISD::UREM, ISD::EXTRACT_VECTOR_ELT}); + ISD::SREM, ISD::UREM, ISD::EXTRACT_VECTOR_ELT, + ISD::VSELECT}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -881,6 +904,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::FUN_SHFR_CLAMP"; case NVPTXISD::IMAD: return "NVPTXISD::IMAD"; + case NVPTXISD::BFE: + return "NVPTXISD::BFE"; + case NVPTXISD::BFI: + return "NVPTXISD::BFI"; + case NVPTXISD::PRMT: + return "NVPTXISD::PRMT"; case NVPTXISD::SETP_F16X2: return "NVPTXISD::SETP_F16X2"; case NVPTXISD::Dummy: @@ -2150,58 +2179,98 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } -// We can init constant f16x2 with a single .b32 move. Normally it +// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it // would get lowered as two constant loads and vector-packing move. -// mov.b16 %h1, 0x4000; -// mov.b16 %h2, 0x3C00; -// mov.b32 %hh2, {%h2, %h1}; // Instead we want just a constant move: -// mov.b32 %hh2, 0x40003C00 -// -// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 -// generates good SASS in both cases. +// mov.b32 %r2, 0x40003C00 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2x16VT(VT))) + if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) return Op; - APInt E0; - APInt E1; - if (VT == MVT::v2f16 || VT == MVT::v2bf16) { - if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) && - isa<ConstantFPSDNode>(Op->getOperand(1)))) - return Op; - - E0 = cast<ConstantFPSDNode>(Op->getOperand(0)) - ->getValueAPF() - .bitcastToAPInt(); - E1 = cast<ConstantFPSDNode>(Op->getOperand(1)) - ->getValueAPF() - .bitcastToAPInt(); - } else { - assert(VT == MVT::v2i16); - if (!(isa<ConstantSDNode>(Op->getOperand(0)) && - isa<ConstantSDNode>(Op->getOperand(1)))) - return Op; - E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue(); - E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue(); + SDLoc DL(Op); +
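To make the packed-constant comment above concrete: when every v4i8 lane is a constant (or undef), the code that follows folds the four lanes into one i32 immediate, lane 0 in the low byte, exactly as GetOperand's shl-by-8*N combination does. A hedged sketch of the packing (packV4I8 is an illustrative name, not an LLVM helper):

    #include <cstdint>
    #include <cstdio>

    // Lane 0 lands in bits 0..7, lane 3 in bits 24..31.
    static uint32_t packV4I8(uint8_t B0, uint8_t B1, uint8_t B2, uint8_t B3) {
      return uint32_t(B0) | uint32_t(B1) << 8 | uint32_t(B2) << 16 |
             uint32_t(B3) << 24;
    }

    int main() {
      // <4 x i8> <i8 1, i8 2, i8 3, i8 4> becomes a single
      // mov.b32 %r, 67305985 (0x04030201), matching the test constants below.
      std::printf("0x%08x\n", packV4I8(1, 2, 3, 4));
      return 0;
    }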
+ if (!llvm::all_of(Op->ops(), [](SDValue Operand) { + return Operand->isUndef() || isa<ConstantSDNode>(Operand) || + isa<ConstantFPSDNode>(Operand); + })) { + // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us + // to optimize calculation of constant parts. + if (VT == MVT::v4i8) { + SDValue C8 = DAG.getConstant(8, DL, MVT::i32); + SDValue E01 = DAG.getNode( + NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); + SDValue E012 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + E01, DAG.getConstant(16, DL, MVT::i32), C8); + SDValue E0123 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + E012, DAG.getConstant(24, DL, MVT::i32), C8); + return DAG.getNode(ISD::BITCAST, DL, VT, E0123); + } + return Op; } - SDValue Const = - DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); + + // Get value of the Nth operand as an APInt(32). Undef values treated as 0. + auto GetOperand = [](SDValue Op, int N) -> APInt { + const SDValue &Operand = Op->getOperand(N); + EVT VT = Op->getValueType(0); + if (Operand->isUndef()) + return APInt(32, 0); + APInt Value; + if (VT == MVT::v2f16 || VT == MVT::v2bf16) + Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); + else if (VT == MVT::v2i16 || VT == MVT::v4i8) + Value = cast<ConstantSDNode>(Operand)->getAPIntValue(); + else + llvm_unreachable("Unsupported type"); + // i8 values are carried around as i16, so we need to zero out upper bits, + // so they do not get in the way of combining individual byte values + if (VT == MVT::v4i8) + Value = Value.trunc(8); + return Value.zext(32); + }; + APInt Value; + if (Isv2x16VT(VT)) { + Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); + } else if (VT == MVT::v4i8) { + Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | + GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); + } else { + llvm_unreachable("Unsupported type"); + } + SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); } SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Index = Op->getOperand(1); + SDValue Vector = Op->getOperand(0); + SDLoc DL(Op); + EVT VectorVT = Vector.getValueType(); + + if (VectorVT == MVT::v4i8) { + SDValue BFE = + DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, + {Vector, + DAG.getNode(ISD::MUL, DL, MVT::i32, + DAG.getZExtOrTrunc(Index, DL, MVT::i32), + DAG.getConstant(8, DL, MVT::i32)), + DAG.getConstant(8, DL, MVT::i32)}); + return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); + } + + // Constant index will be matched by tablegen. if (isa<ConstantSDNode>(Index.getNode())) return Op;
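The BFI chain above assembles a non-constant v4i8 one byte at a time, and the new EXTRACT_VECTOR_ELT path reads a lane back with a BFE at bit offset 8*index. A rough C++ model of the two PTX primitives as this patch uses them (one reading of the PTX ISA for 8-bit fields; not the ptxas implementation, and without the ISA's pos/len clamping):

    #include <cstdint>
    #include <cstdio>

    // bfe.u32 d, a, pos, len: extract len bits of a starting at bit pos.
    static uint32_t bfe_u32(uint32_t A, uint32_t Pos, uint32_t Len) {
      return (A >> Pos) & ((1u << Len) - 1);
    }

    // bfi.b32 f, a, b, pos, len: insert the low len bits of a into b at pos.
    static uint32_t bfi_b32(uint32_t A, uint32_t B, uint32_t Pos, uint32_t Len) {
      uint32_t Mask = ((1u << Len) - 1) << Pos;
      return (B & ~Mask) | ((A << Pos) & Mask);
    }

    int main() {
      // Build <4 x i8> <i8 0x11, i8 0x22, i8 0x33, i8 0x44> the way the
      // BFI chain does: lane 1 into lane 0, then lane 2, then lane 3.
      uint32_t V = bfi_b32(0x22, 0x11, 8, 8);
      V = bfi_b32(0x33, V, 16, 8);
      V = bfi_b32(0x44, V, 24, 8);
      std::printf("packed = 0x%08x\n", V);                 // 0x44332211
      std::printf("lane 2 = 0x%02x\n", bfe_u32(V, 16, 8)); // 0x33
      return 0;
    }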
// Extract individual elements and select one of them. - SDValue Vector = Op->getOperand(0); - EVT VectorVT = Vector.getValueType(); assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); EVT EltVT = VectorVT.getVectorElementType(); @@ -2214,6 +2283,49 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, ISD::CondCode::SETEQ); } +SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Vector = Op->getOperand(0); + EVT VectorVT = Vector.getValueType(); + + if (VectorVT != MVT::v4i8) + return Op; + SDLoc DL(Op); + SDValue Value = Op->getOperand(1); + if (Value->isUndef()) + return Vector; + + SDValue Index = Op->getOperand(2); + + SDValue BFI = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, + DAG.getNode(ISD::MUL, DL, MVT::i32, + DAG.getZExtOrTrunc(Index, DL, MVT::i32), + DAG.getConstant(8, DL, MVT::i32)), + DAG.getConstant(8, DL, MVT::i32)}); + return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); +} + +SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDValue V1 = Op.getOperand(0); + EVT VectorVT = V1.getValueType(); + if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) + return Op; + + // Lower shuffle to PRMT instruction. + const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + SDValue V2 = Op.getOperand(1); + uint32_t Selector = 0; + for (auto I : llvm::enumerate(SVN->getMask())) + Selector |= (I.value() << (I.index() * 4)); + + SDLoc DL(Op); + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, + DAG.getConstant(Selector, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); +} /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift /// amount, or @@ -2464,6 +2576,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return Op; case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::STORE: @@ -2578,9 +2694,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) return LowerLOADi1(Op, DAG); - // v2f16/v2bf16/v2i16 are legal, so we can't rely on legalizer to handle + // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle // unaligned loads and have to handle it here. - if (Isv2x16VT(Op.getValueType())) { + EVT VT = Op.getValueType(); + if (Isv2x16VT(VT) || VT == MVT::v4i8) { LoadSDNode *Load = cast<LoadSDNode>(Op); EVT MemVT = Load->getMemoryVT(); if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), @@ -2625,13 +2742,13 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. - if (Isv2x16VT(VT) && + if ((Isv2x16VT(VT) || VT == MVT::v4i8) && !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG);
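For the LowerVECTOR_SHUFFLE hunk above: the shuffle mask is encoded as one selector nibble per result byte, where values 0-3 pick bytes of the first source and 4-7 pick bytes of the second. A hedged C++ model of default-mode prmt.b32 and of the Selector loop (a transcription of the PTX ISA semantics, including the bit-3 sign-replication case; not ptxas source):

    #include <cstdint>
    #include <cstdio>

    // Default-mode prmt.b32: each selector nibble i picks byte (nibble & 7)
    // from the 64-bit concatenation {b, a}; if bit 3 of the nibble is set,
    // the chosen byte's sign bit is replicated instead.
    static uint32_t prmt_b32(uint32_t A, uint32_t B, uint32_t Sel) {
      uint64_t Bytes = (uint64_t(B) << 32) | A;
      uint32_t R = 0;
      for (int I = 0; I < 4; ++I) {
        uint32_t Nib = (Sel >> (I * 4)) & 0xF;
        uint32_t Byte = uint32_t(Bytes >> ((Nib & 7) * 8)) & 0xFF;
        if (Nib & 8)
          Byte = (Byte & 0x80) ? 0xFF : 0x00;
        R |= Byte << (I * 8);
      }
      return R;
    }

    int main() {
      // shufflevector <4 x i8> %a, %b, <i32 0, i32 4, i32 1, i32 5> yields
      // selector 0x5140, built exactly like the Selector loop above.
      int Mask[4] = {0, 4, 1, 5};
      uint32_t Sel = 0;
      for (int I = 0; I < 4; ++I)
        Sel |= uint32_t(Mask[I]) << (I * 4);
      std::printf("sel = 0x%04x r = 0x%08x\n", Sel,
                  prmt_b32(0x44332211, 0x88776655, Sel)); // r = 0x66225511
      return 0;
    }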
// v2f16, v2bf16 and v2i16 don't need special handling. - if (Isv2x16VT(VT)) + if (Isv2x16VT(VT) || VT == MVT::v4i8) return SDValue(); if (VT.isVector()) @@ -2903,7 +3020,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT LoadVT = EltVT; if (EltVT == MVT::i1) LoadVT = MVT::i8; - else if (Isv2x16VT(EltVT)) + else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) // getLoad needs a vector type, but it can't handle // vectors which contain v2f16 or v2bf16 elements. So we must load // using i32 here and then bitcast back. @@ -2929,7 +3046,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (EltVT == MVT::i1) Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); // v2f16 was loaded as an i32. Now we must bitcast it back. - else if (Isv2x16VT(EltVT)) + else if (EltVT != LoadVT) Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); // If a promoted integer type is used, truncate down to the original @@ -4975,6 +5092,32 @@ static SDValue PerformANDCombine(SDNode *N, } SDValue AExt; + + // Convert BFE -> truncate i16 -> and 255 + // to just BFE -> truncate i16, as the value already has all the bits in the + // right places. + if (Val.getOpcode() == ISD::TRUNCATE) { + SDValue BFE = Val.getOperand(0); + if (BFE.getOpcode() != NVPTXISD::BFE) + return SDValue(); + + ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); + if (!BFEBits) + return SDValue(); + uint64_t BFEBitsVal = BFEBits->getZExtValue(); + + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + uint64_t MaskVal = MaskCnst->getZExtValue(); + + if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) + return SDValue(); + // If we get here, the AND is unnecessary. Just replace it with the trunc + DCI.CombineTo(N, Val, false); + }
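The new PerformANDCombine case above drops an `and` that masks a value an 8-bit BFE already produced; the mask is redundant precisely when it equals (1 << width) - 1. A small check mirroring that test (illustrative names, not from the patch):

    #include <cstdint>
    #include <cstdio>

    // An AND mask is a no-op after a Len-bit field extract iff it keeps
    // exactly the Len low bits.
    static bool maskIsRedundantAfterBFE(uint64_t Mask, unsigned Len) {
      return Mask == (uint64_t(1) << Len) - 1;
    }

    int main() {
      std::printf("%d\n", maskIsRedundantAfterBFE(255, 8)); // 1: and dropped
      std::printf("%d\n", maskIsRedundantAfterBFE(127, 8)); // 0: and kept
      return 0;
    }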
// Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and if (Val.getOpcode() == ISD::ANY_EXTEND) { AExt = Val; @@ -5254,13 +5397,15 @@ static SDValue PerformSETCCCombine(SDNode *N, static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue Vector = N->getOperand(0); + SDLoc DL(N); EVT VectorVT = Vector.getValueType(); if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() && IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ - // extract_vector_elt. + // extract_vector_elt, except for v4i8. // Don't mess with singletons or v2*16 types, we already handle them OK. - if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT)) + if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || + VectorVT == MVT::v4i8) return SDValue(); uint64_t VectorBits = VectorVT.getSizeInBits(); @@ -5273,8 +5418,6 @@ static SDValue PerformEXTRACTCombine(SDNode *N, if (!Index || Index->getZExtValue() == 0) return SDValue(); - SDLoc DL(N); - MVT IVT = MVT::getIntegerVT(VectorBits); EVT EltVT = VectorVT.getVectorElementType(); EVT EltIVT = EltVT.changeTypeToInteger(); @@ -5289,9 +5432,45 @@ static SDValue PerformEXTRACTCombine(SDNode *N, // If element has non-integer type, bitcast it back to the expected type. if (EltVT != EltIVT) Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result); + // Past legalizer, we may need to extend i8 -> i16 to match the register type. + if (EltVT != N->getValueType(0)) + Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result); + return Result; } +static SDValue PerformVSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue VA = N->getOperand(1); + EVT VectorVT = VA.getValueType(); + if (VectorVT != MVT::v4i8) + return SDValue(); + + // We need to split vselect into individual per-element operations. Because we + // use BFE/BFI instructions for byte extraction/insertion, we do end up with + // 32-bit values, so we may as well do comparison as i32 to avoid conversions + // to/from i16 normally used for i8 values. + SmallVector<SDValue, 4> E; + SDLoc DL(N); + SDValue VCond = N->getOperand(0); + SDValue VB = N->getOperand(2); + for (int I = 0; I < 4; ++I) { + SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond, + DCI.DAG.getConstant(I, DL, MVT::i32)); + SDValue EA = DCI.DAG.getAnyExtOrTrunc( + DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA, + DCI.DAG.getConstant(I, DL, MVT::i32)), + DL, MVT::i32); + SDValue EB = DCI.DAG.getAnyExtOrTrunc( + DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB, + DCI.DAG.getConstant(I, DL, MVT::i32)), + DL, MVT::i32); + E.push_back(DCI.DAG.getAnyExtOrTrunc( + DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8)); + } + return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); @@ -5317,6 +5496,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformStoreRetvalCombine(N); case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACTCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index cd1985cc4219b..5c7c10965e2f2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -57,6 +57,9 @@ enum NodeType : unsigned { MUL_WIDE_UNSIGNED, IMAD, SETP_F16X2, + BFE, + BFI, + PRMT, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -590,6 +593,8 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index df2f706a2ad1f..84ed953ad18a9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -76,6 +76,10 @@ def CmpLT : PatLeaf<(i32 2)>; def CmpLE : PatLeaf<(i32 3)>; def CmpGT : PatLeaf<(i32 4)>; def CmpGE : PatLeaf<(i32 5)>; +def CmpLO : PatLeaf<(i32 6)>; +def CmpLS : PatLeaf<(i32 7)>; +def CmpHI : PatLeaf<(i32 8)>; +def CmpHS : PatLeaf<(i32 9)>; def CmpEQU : PatLeaf<(i32 10)>; def CmpNEU : PatLeaf<(i32 11)>; def CmpLTU : PatLeaf<(i32 12)>; @@ -107,6 +111,21 @@ def VecElement : Operand<i32> { let PrintMethod = "printVecElement"; } +// PRMT modes +// These must match the enum in NVPTX.h +def PrmtNONE : PatLeaf<(i32 0x0)>; +def PrmtF4E : PatLeaf<(i32 0x1)>; +def PrmtB4E : PatLeaf<(i32 0x2)>;
+def PrmtRC8 : PatLeaf<(i32 0x3)>; +def PrmtECL : PatLeaf<(i32 0x4)>; +def PrmtECR : PatLeaf<(i32 0x5)>; +def PrmtRC16 : PatLeaf<(i32 0x6)>; + +def PrmtMode : Operand<i32> { + let PrintMethod = "printPrmtMode"; +} + + //===----------------------------------------------------------------------===// // NVPTX Instruction Predicate Definitions //===----------------------------------------------------------------------===// @@ -742,7 +761,7 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>; // def v2f16imm : Operand<v2f16>; // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>; -foreach vt = [v2f16, v2bf16, v2i16] in { +foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))), (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>; } @@ -1486,23 +1505,24 @@ defm OR : BITWISE<"or", or>; defm AND : BITWISE<"and", and>; defm XOR : BITWISE<"xor", xor>; -// Lower logical v2i16 ops as bitwise ops on b32. -def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)), - (ORb32rr Int32Regs:$a, Int32Regs:$b)>; -def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)), - (XORb32rr Int32Regs:$a, Int32Regs:$b)>; -def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)), - (ANDb32rr Int32Regs:$a, Int32Regs:$b)>; - -// The constants get legalized into a bitcast from i32, so that's what we need -// to match here. -def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))), - (ORb32ri Int32Regs:$a, imm:$b)>; -def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))), - (XORb32ri Int32Regs:$a, imm:$b)>; -def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))), - (ANDb32ri Int32Regs:$a, imm:$b)>; - +// Lower logical v2i16/v4i8 ops as bitwise ops on b32. +foreach vt = [v2i16, v4i8] in { + def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)), + (ORb32rr Int32Regs:$a, Int32Regs:$b)>; + def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)), + (XORb32rr Int32Regs:$a, Int32Regs:$b)>; + def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)), + (ANDb32rr Int32Regs:$a, Int32Regs:$b)>; + + // The constants get legalized into a bitcast from i32, so that's what we need + // to match here. + def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), + (ORb32ri Int32Regs:$a, imm:$b)>; + def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), + (XORb32ri Int32Regs:$a, imm:$b)>; + def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), + (ANDb32ri Int32Regs:$a, imm:$b)>; +}
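The foreach above reuses the scalar b32 and/or/xor for v2i16 and v4i8, which is sound because bitwise operations never carry information across bit (and hence lane) boundaries. A quick C++ spot-check of that property (illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t A = 0x11223344, B = 0x0F0F00FF;
      uint32_t Whole = A & B; // one and.b32 over the packed v4i8
      uint32_t Lanes = 0;     // per-lane i8 ANDs, reassembled
      for (int I = 0; I < 4; ++I) {
        uint32_t La = (A >> (I * 8)) & 0xFF, Lb = (B >> (I * 8)) & 0xFF;
        Lanes |= (La & Lb) << (I * 8);
      }
      std::printf("%s\n", Whole == Lanes ? "match" : "mismatch"); // match
      return 0;
    }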
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), "not.pred \t$dst, $src;", @@ -1737,34 +1757,119 @@ def FUNSHFRCLAMP : // restriction in PTX? // // dest and src may be int32 or int64, but start and end are always int32. -multiclass BFX<string Instr, RegisterClass RC> { +def SDTBFE : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>; + +def SDTBFI : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; +def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>; + +def SDTPRMT : + SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>; +def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>; + +multiclass BFE<string Instr, ValueType T, RegisterClass RC> { def rrr : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, Int32Regs:$c), - !strconcat(Instr, " \t$d, $a, $b, $c;"), []>; + !strconcat(Instr, " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>; def rri : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, i32imm:$c), - !strconcat(Instr, " \t$d, $a, $b, $c;"), []>; + !strconcat(Instr, " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>; def rii : NVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), - !strconcat(Instr, " \t$d, $a, $b, $c;"), []>; + !strconcat(Instr, " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>; +} + +multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> { + def rrrr + : NVPTXInst<(outs RC:$f), + (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; + def rrri + : NVPTXInst<(outs RC:$f), + (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; + def rrii + : NVPTXInst<(outs RC:$f), + (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; + def irrr + : NVPTXInst<(outs RC:$f), + (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; + def irri + : NVPTXInst<(outs RC:$f), + (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; + def irii + : NVPTXInst<(outs RC:$f), + (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d), + !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), + [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; +} + +multiclass PRMT<ValueType T, RegisterClass RC> { + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode), + !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode), + !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode), + !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), + [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>; } let hasSideEffects = false in { - defm BFE_S32 : BFX<"bfe.s32", Int32Regs>; - defm BFE_U32 : BFX<"bfe.u32", Int32Regs>; - defm BFE_S64 : BFX<"bfe.s64", Int64Regs>; - defm BFE_U64 : BFX<"bfe.u64", Int64Regs>;
+ defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>; + defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>; + defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>; + defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>; + + defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>; + defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>; - defm BFI_S32 : BFX<"bfi.s32", Int32Regs>; - defm BFI_U32 : BFX<"bfi.u32", Int32Regs>; - defm BFI_S64 : BFX<"bfi.s64", Int64Regs>; - defm BFI_U64 : BFX<"bfi.u64", Int64Regs>; + defm PRMT_B32 : PRMT<i32, Int32Regs>; } -// Common byte extraction patterns + +// byte extraction + signed/unsigned extension to i32. +def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)), + (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>; +def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)), + (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; +def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)), + (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>; +def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)), + (BFE_U32rii Int32Regs:$s, imm:$o, 8)>; + +// byte extraction + signed extension to i16 +def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)), + (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>; + + +// Byte extraction via shift/trunc/sext def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)), (CVT_s8_s32 Int32Regs:$s, CvtNONE)>; def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)), @@ -1773,7 +1878,6 @@ def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8), (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))), (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>; - def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8), (BFE_S64rii Int64Regs:$s, imm:$o, 8)>; def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)), @@ -2110,6 +2214,29 @@ def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; +// comparisons of i8 extracted with BFE as i32 +def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), + (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>; +def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), + (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>; +def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), + (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>; +def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)), + (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>; + +def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>; +def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>; +def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>; +def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>; +def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>; +def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))), + (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
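About the setp patterns just above: comparing two bytes after sign extension (or zero extension) to i32 orders them exactly as an i8 compare would, so the sext_inreg/and wrappers fold away into a single 32-bit setp. A brute-force C++ sanity check of that equivalence (illustrative, not from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int A = -128; A < 128; ++A)
        for (int B = -128; B < 128; ++B) {
          bool S8 = int8_t(A) < int8_t(B);   // signed i8 compare
          bool S32 = A < B;                  // i32 compare, sign-extended bytes
          bool U8 = uint8_t(A) < uint8_t(B); // unsigned i8 compare
          bool U32 = uint32_t(uint8_t(A)) < uint32_t(uint8_t(B));
          if (S8 != S32 || U8 != U32) {
            std::puts("mismatch");
            return 1;
          }
        }
      std::puts("all 65536 pairs agree");
      return 0;
    }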
+ // i1 compare -> i32 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; @@ -2682,7 +2809,7 @@ foreach vt = [f16, bf16] in { def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>; } -foreach vt = [v2f16, v2bf16, v2i16] in { +foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>; } @@ -2995,8 +3122,8 @@ def: Pat<(i16 (bitconvert (vt Int16Regs:$a))), (ProxyRegI16 Int16Regs:$a)>; } -foreach ta = [v2f16, v2bf16, v2i16, i32] in { - foreach tb = [v2f16, v2bf16, v2i16, i32] in { +foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in { + foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in { if !ne(ta, tb) then { def: Pat<(ta (bitconvert (tb Int32Regs:$a))), (ProxyRegI32 Int32Regs:$a)>; @@ -3342,6 +3469,9 @@ def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), (I64toI32H Int64Regs:$s)>; +def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))), + (CVT_INREG_s32_s16 Int32Regs:$src)>; + foreach vt = [v2f16, v2bf16, v2i16] in { def : Pat<(extractelt (vt Int32Regs:$src), 0), (I32toI16L Int32Regs:$src)>; diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index ed9dabf39dd7a..b5231a9cf67f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -58,7 +58,7 @@ foreach i = 0...31 in { //===----------------------------------------------------------------------===// def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>; def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; -def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16], 32, +def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index da07f973501c8..ed7dd45ab7b45 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -18,7 +18,8 @@ define i16 @test_v2i8(i16 %a) { ; CHECK-LABEL: test_v4i8 ; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_param_0]; -; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R]]; +; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8; +; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R0]]; ; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8; ; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9+]]], [[R1]]; ; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8; @@ -41,6 +42,58 @@ define i16 @test_v4i8(i32 %a) { ret i16 %r } +; CHECK-LABEL: test_v4i8_s32 +; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_s32_param_0]; +; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8; +; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8; +; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8; +; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8; +; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]] +; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]] +; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]] +define i32 @test_v4i8_s32(i32 %a) { + %v = bitcast i32 %a to <4 x i8> + %r0 = extractelement <4 x i8> %v, i64 0 + %r1 = extractelement <4 x i8> %v, i64 1 + %r2 = extractelement <4 x i8> %v, i64 2 + %r3 = extractelement <4 x i8> %v, i64 3 + %r0i = sext i8 %r0 to i32 + %r1i = sext i8 %r1 to i32 + %r2i = sext i8 %r2 to i32 + %r3i = sext i8 %r3 to 
i32 + %r01 = add i32 %r0i, %r1i + %r23 = add i32 %r2i, %r3i + %r = add i32 %r01, %r23 + ret i32 %r +} + +; CHECK-LABEL: test_v4i8_u32 +; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_u32_param_0]; +; CHECK-DAG: bfe.u32 [[R0:%r[0-9+]]], [[R]], 0, 8; +; CHECK-DAG: bfe.u32 [[R1:%r[0-9+]]], [[R]], 8, 8; +; CHECK-DAG: bfe.u32 [[R2:%r[0-9+]]], [[R]], 16, 8; +; CHECK-DAG: bfe.u32 [[R3:%r[0-9+]]], [[R]], 24, 8; +; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]] +; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]] +; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]] +define i32 @test_v4i8_u32(i32 %a) { + %v = bitcast i32 %a to <4 x i8> + %r0 = extractelement <4 x i8> %v, i64 0 + %r1 = extractelement <4 x i8> %v, i64 1 + %r2 = extractelement <4 x i8> %v, i64 2 + %r3 = extractelement <4 x i8> %v, i64 3 + %r0i = zext i8 %r0 to i32 + %r1i = zext i8 %r1 to i32 + %r2i = zext i8 %r2 to i32 + %r3i = zext i8 %r3 to i32 + %r01 = add i32 %r0i, %r1i + %r23 = add i32 %r2i, %r3i + %r = add i32 %r01, %r23 + ret i32 %r +} + + + ; CHECK-LABEL: test_v8i8 ; CHECK: ld.param.u64 [[R:%rd[0-9+]]], [test_v8i8_param_0]; ; CHECK-DAG: cvt.s8.s64 [[E0:%rs[0-9+]]], [[R]]; diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 5a22bbcf7416c..684e4bc38d83d 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -5,7 +5,7 @@ ; RUN: %if ptxas %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: | %ptxas-verify -arch=sm_90 \ ; RUN: %} ; ## No support for i16x2 instructions ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll new file mode 100644 index 0000000000000..fd48313ad6848 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -0,0 +1,1272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; ## Support i8x4 instructions +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: %} + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +define <4 x i8> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, -66911489; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: ret; + ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4> +} + +define i8 @test_extract_0(<4 x i8> %a) #0 { +; CHECK-LABEL: test_extract_0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <4 x i8> %a, i32 0 + ret i8 %e +} + +define i8 @test_extract_1(<4 x i8> %a) #0 { +; CHECK-LABEL: test_extract_1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; +;
CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <4 x i8> %a, i32 1 + ret i8 %e +} + +define i8 @test_extract_2(<4 x i8> %a) #0 { +; CHECK-LABEL: test_extract_2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <4 x i8> %a, i32 2 + ret i8 %e +} + +define i8 @test_extract_3(<4 x i8> %a) #0 { +; CHECK-LABEL: test_extract_3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <4 x i8> %a, i32 3 + ret i8 %e +} + +define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { +; CHECK-LABEL: test_extract_i( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0]; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: shl.b32 %r3, %r2, 3; +; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %e = extractelement <4 x i8> %a, i64 %idx + ret i8 %e +} + +define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_add( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; +; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; +; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %r = add <4 x i8> %a, %b + ret <4 x i8> %r +} + +define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { +; CHECK-LABEL: test_add_imm_0( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 
%rs4, %rs3, 2; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; +; CHECK-NEXT: ret; + %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a + ret <4 x i8> %r +} + +define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { +; CHECK-LABEL: test_add_imm_1( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; +; CHECK-NEXT: ret; + %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> + ret <4 x i8> %r +} + +define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_sub( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; +; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; +; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %r = sub <4 x i8> %a, %b + ret <4 x i8> %r +} + +define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_smax( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; +; CHECK-NEXT:
ld.param.u32 %r1, [test_smax_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: setp.gt.s32 %p1, %r3, %r4; +; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: setp.gt.s32 %p2, %r5, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: setp.gt.s32 %p3, %r7, %r8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: setp.gt.s32 %p4, %r9, %r10; +; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; +; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %cmp = icmp sgt <4 x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_umax( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: setp.hi.u32 %p1, %r3, %r4; +; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: setp.hi.u32 %p2, %r5, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: setp.hi.u32 %p3, %r7, %r8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: setp.hi.u32 %p4, %r9, %r10; +; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; +; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %cmp = icmp ugt <4 x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_smin( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: setp.le.s32 %p1, %r3, %r4; +; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: setp.le.s32 %p2, %r5, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: setp.le.s32 %p3, %r7, %r8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: setp.le.s32 %p4, %r9, %r10; +; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; +; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %cmp = icmp sle <4 
x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_umin( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: setp.ls.u32 %p1, %r3, %r4; +; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: setp.ls.u32 %p2, %r5, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: setp.ls.u32 %p3, %r7, %r8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: setp.ls.u32 %p4, %r9, %r10; +; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4; +; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %cmp = icmp ule <4 x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { +; CHECK-LABEL: test_eq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; +; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8; +; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; +; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8; +; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; +; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8; +; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8; +; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; +; CHECK-NEXT: ret; + %cmp = icmp eq <4 x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c + ret <4 x i8> %r +} + +define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { +; CHECK-LABEL: test_ne( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; +; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; +; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8; +; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; +; CHECK-NEXT: 
bfe.s32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8; +; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; +; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8; +; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8; +; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; +; CHECK-NEXT: ret; + %cmp = icmp ne <4 x i8> %a, %b + %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c + ret <4 x i8> %r +} + +define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_mul( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; +; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; +; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: ret; + %r = mul <4 x i8> %a, %b + ret <4 x i8> %r +} + +define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_or( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r3, [test_or_param_1]; +; CHECK-NEXT: ld.param.u32 %r4, [test_or_param_0]; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: ret; + %r = or <4 x i8> %a, %b + ret <4 x i8> %r +} + +define <4 x i8> @test_or_computed(i8 %a) { +; CHECK-LABEL: test_or_computed( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r4; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; +; CHECK-NEXT: ret; + %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 
+ %r = or <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_or_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0];
+; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_or_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0];
+; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_xor(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_xor_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_xor_param_0];
+; CHECK-NEXT: xor.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_computed(i8 %a) {
+; CHECK-LABEL: test_xor_computed(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
+; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
+; CHECK-NEXT: xor.b32 %r8, %r6, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %r = xor <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_xor_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0];
+; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_xor_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0];
+; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_and_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_and_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_computed(i8 %a) {
+; CHECK-LABEL: test_and_computed(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
+; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
+; CHECK-NEXT: and.b32 %r8, %r6, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %r = and <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_and_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_and_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define void @test_ldst_v2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v2i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a
+ store <4 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v3i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u16 [%rd2], %r1;
+; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: st.u8 [%rd2+2], %r3;
+; CHECK-NEXT: ret;
+ %t1 = load <3 x i8>, ptr %a
+ store <3 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v4i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a
+ store <4 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v4i8_unaligned(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
+; CHECK-NEXT: ld.u8 %r1, [%rd1];
+; CHECK-NEXT: ld.u8 %r2, [%rd1+1];
+; CHECK-NEXT: ld.u8 %r3, [%rd1+2];
+; CHECK-NEXT: ld.u8 %r4, [%rd1+3];
+; CHECK-NEXT: st.u8 [%rd2+3], %r4;
+; CHECK-NEXT: st.u8 [%rd2+2], %r3;
+; CHECK-NEXT: st.u8 [%rd2+1], %r2;
+; CHECK-NEXT: st.u8 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a, align 1
+ store <4 x i8> %t1, ptr %b, align 1
+ ret void
+}
+
+
+define void @test_ldst_v8i8(ptr %a, ptr %b)
{ +; CHECK-LABEL: test_ldst_v8i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.u32 %r2, [%rd1+4]; +; CHECK-NEXT: st.u32 [%rd2+4], %r2; +; CHECK-NEXT: st.u32 [%rd2], %r1; +; CHECK-NEXT: ret; + %t1 = load <8 x i8>, ptr %a + store <8 x i8> %t1, ptr %b, align 16 + ret void +} + +declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 + +define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .reg .b32 temp_param_reg; +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0+0], %r1; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1+0], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) + ret <4 x i8> %r +} + +define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_call_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .reg .b32 temp_param_reg; +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0+0], %r2; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1+0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) + ret <4 x i8> %r +} + +define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { +; CHECK-LABEL: test_tailcall_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .reg .b32 temp_param_reg; +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0+0], %r2; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1+0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) + ret <4 x i8> %r +} + +define <4 x i8> 
@test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1; +; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0]; +; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %r = select i1 %c, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 { +; CHECK-LABEL: test_select_cc( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; +; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; +; CHECK-NEXT: bfe.s32 %r5, %r4, 24, 8; +; CHECK-NEXT: bfe.s32 %r6, %r3, 24, 8; +; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; +; CHECK-NEXT: bfe.s32 %r7, %r4, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 16, 8; +; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; +; CHECK-NEXT: bfe.s32 %r9, %r4, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r3, 8, 8; +; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; +; CHECK-NEXT: bfe.s32 %r11, %r4, 0, 8; +; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8; +; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; +; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; +; CHECK-NEXT: bfe.s32 %r16, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r17, %r1, 8, 8; +; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; +; CHECK-NEXT: bfe.s32 %r20, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r21, %r1, 16, 8; +; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; +; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; +; CHECK-NEXT: bfe.s32 %r24, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r25, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; +; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; +; CHECK-NEXT: ret; + %cc = icmp ne <4 x i8> %c, %d + %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + +define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, +; CHECK-LABEL: test_select_cc_i32_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; +; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; +; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3]; +; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2]; +; CHECK-NEXT: bfe.s32 %r11, %r10, 0, 8; +; CHECK-NEXT: bfe.s32 %r12, %r9, 0, 8; +; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11; +; CHECK-NEXT: bfe.s32 %r13, %r10, 8, 8; +; CHECK-NEXT: bfe.s32 %r14, %r9, 8, 8; +; CHECK-NEXT: setp.ne.u32 %p2, %r14, %r13; +; CHECK-NEXT: bfe.s32 %r15, %r10, 16, 8; +; CHECK-NEXT: bfe.s32 %r16, %r9, 16, 8; +; CHECK-NEXT: setp.ne.u32 %p3, %r16, %r15; +; CHECK-NEXT: bfe.s32 %r17, %r10, 24, 8; +; CHECK-NEXT: bfe.s32 %r18, %r9, 24, 8; +; CHECK-NEXT: setp.ne.u32 %p4, %r18, %r17; +; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4; 
+; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3; +; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2; +; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r22, %r21, %r20, %r19}; +; CHECK-NEXT: ret; + <4 x i8> %c, <4 x i8> %d) #0 { + %cc = icmp ne <4 x i8> %c, %d + %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %r +} + +define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, +; CHECK-LABEL: test_select_cc_i8_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; +; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; +; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; +; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; +; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; +; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; +; CHECK-NEXT: bfe.s32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r12, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; +; CHECK-NEXT: bfe.s32 %r14, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r15, %r1, 8, 8; +; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; +; CHECK-NEXT: bfe.s32 %r18, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r19, %r1, 16, 8; +; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; +; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; +; CHECK-NEXT: bfe.s32 %r22, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r23, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; +; CHECK-NEXT: ret; + <4 x i32> %c, <4 x i32> %d) #0 { + %cc = icmp ne <4 x i32> %c, %d + %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b + ret <4 x i8> %r +} + + +define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { +; CHECK-LABEL: test_trunc_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; +; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; +; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-NEXT: ret; + %r = trunc <4 x i32> %a to <4 x i8> + ret <4 x i8> %r +} + +define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { +; CHECK-LABEL: test_trunc_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; +; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; +; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; +; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-NEXT: ret; + %r = trunc <4 x i64> %a to <4 x i8> + ret <4 x i8> %r +} + +define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 { +; CHECK-LABEL: test_zext_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; 
CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r5, %r4, %r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <4 x i8> %a to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
+; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
+; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
+; CHECK-NEXT: and.b64 %rd4, %rd3, 255;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd5, %r4;
+; CHECK-NEXT: and.b64 %rd6, %rd5, 255;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd7, %r5;
+; CHECK-NEXT: and.b64 %rd8, %rd7, 255;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
+; CHECK-NEXT: ret;
+ %r = zext <4 x i8> %a to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i8> @test_bitcast_i32_to_2xi8(i32 %a) #0 {
+; CHECK-LABEL: test_bitcast_i32_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi8_param_0];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %r = bitcast i32 %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define i32 @test_bitcast_2xi8_to_i32(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi8_to_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xi8_to_i32_param_0];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = bitcast <4 x i8> %a to i32
+ ret i32 %r
+}
+
+define <2 x half> @test_bitcast_2xi8_to_2xhalf(i8 %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi8_to_2xhalf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_2xi8_to_2xhalf_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 7, i32 3
+ %r = bitcast <4 x i8> %ins.3 to <2 x half>
+ ret <2 x half> %r
+}
+
+
+define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_shufflevector(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
+; CHECK-NEXT: // implicit-def: %r3
+; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 291;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i8> %s
+}
+
+define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_shufflevector_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 9527;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2>
+ ret <4 x i8> %s
+}
+
+
+define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
+; CHECK-LABEL: test_insertelement(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %i = insertelement <4 x i8> %a, i8 %x, i64 1
+ ret <4 x i8> %i
+}
+
+define <4 x i8> @test_fptosi_2xhalf_to_2xi8(<4 x half> %a) #0 {
+; CHECK-LABEL: test_fptosi_2xhalf_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_2xhalf_to_2xi8_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
+; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7;
+; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
+; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT: ret;
+ %r = fptosi <4 x half> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_fptoui_2xhalf_to_2xi8(<4 x half> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xhalf_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_2xhalf_to_2xi8_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
+; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7;
+; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
+; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT: ret;
+ %r = fptoui <4 x half> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index 9012339fb6b1e..98ab93774588d 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -130,9 +130,9 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
 }
 ; SM20-LABEL: .visible .entry foo13(
-; SM20: ld.global.v4.u8
+;
SM20: ld.global.u32 ; SM35-LABEL: .visible .entry foo13( -; SM35: ld.global.nc.v4.u8 +; SM35: ld.global.nc.u32 define void @foo13(ptr noalias readonly %from, ptr %to) { %1 = load <4 x i8>, ptr %from store <4 x i8> %1, ptr %to diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index 2d87271e30ae0..b4208c691c91d 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -212,18 +212,16 @@ define signext i8 @test_i8s(i8 signext %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_v3i8( ; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; +; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v3i8_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+2], [[E2]]; +; CHECK: st.param.b32 [param0+0], [[R]] ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v3i8, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0+0]; +; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very +; interesting here, so it's skipped. +; CHECK: st.param.b32 [func_retval0+0], ; CHECK-NEXT: ret; define <3 x i8> @test_v3i8(<3 x i8> %a) { %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); @@ -233,14 +231,14 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_v4i8( ; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] +; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0] ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: st.param.b32 [param0+0], [[R]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v4i8, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[RET]]; ; CHECK-NEXT: ret; define <4 x i8> @test_v4i8(<4 x i8> %a) { %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); @@ -250,10 +248,10 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v5i8( ; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] +; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_v5i8_param_0] ; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; -; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0] ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.v4.b8 [param0+0], ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[8]; ; CHECK: call.uni (retval0), diff --git 
a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 16579de882ed4..8633b09af0487 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -60,46 +60,17 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
 define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v4i8(
 ; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot2[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<21>;
+; CHECK-NEXT: .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot2;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: xor.b32 %r9, %r5, 16711935;
-; CHECK-NEXT: xor.b32 %r10, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r11, %r8, %r10;
-; CHECK-NEXT: and.b32 %r12, %r7, %r9;
-; CHECK-NEXT: or.b32 %r13, %r6, %r12;
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r13;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14};
-; CHECK-NEXT: or.b32 %r14, %r4, %r11;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16};
-; CHECK-NEXT: ld.u16 %r15, [%SP+0];
-; CHECK-NEXT: shl.b32 %r16, %r15, 16;
-; CHECK-NEXT: ld.u16 %r17, [%SP+2];
-; CHECK-NEXT: or.b32 %r18, %r17, %r16;
-; CHECK-NEXT: shr.u32 %r19, %r18, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs17, %r19;
-; CHECK-NEXT: cvt.u16.u32 %rs18, %r15;
-; CHECK-NEXT: bfe.s32 %r20, %r15, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
 ; CHECK-NEXT: ret;
 %mx = and <4 x i8> %x, %mask
 %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -111,48 +82,17 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v4i8_undef(
 ; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot3[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<22>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot3;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_undef_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_undef_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_undef_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: mov.u16 %rs13, 255;
-; CHECK-NEXT: mov.b32 %r9, {%rs14, %rs13};
-; CHECK-NEXT: xor.b32 %r10, %r5, %r9;
-; CHECK-NEXT: xor.b32 %r11, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r12, %r8, %r11;
-; CHECK-NEXT: and.b32 %r13, %r7, %r10;
-; CHECK-NEXT: or.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs15, %rs16};
-; CHECK-NEXT: or.b32 %r15, %r4, %r12;
-; CHECK-NEXT: mov.b32 {%rs17, %rs18}, %r15;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs17, %rs18};
-; CHECK-NEXT: ld.u16 %r16, [%SP+0];
-; CHECK-NEXT: shl.b32 %r17, %r16, 16;
-; CHECK-NEXT: ld.u16 %r18, [%SP+2];
-; CHECK-NEXT: or.b32 %r19, %r18, %r17;
-; CHECK-NEXT: shr.u32 %r20, %r19, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: cvt.u16.u32 %rs20, %r16;
-; CHECK-NEXT: bfe.s32 %r21, %r16, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs21, %r21;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs17, %rs19, %rs20, %rs21};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: xor.b32 %r7, %r1, -16711681;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
 ; CHECK-NEXT: ret;
 %mx = and <4 x i8> %x, %mask
 %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -212,84 +152,21 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
 define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v8i8(
 ; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot6[8];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<40>;
-; CHECK-NEXT: .reg .b32 %r<38>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b32 %r<21>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot6;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v8i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v8i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v8i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r8, %r3, %r7;
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v8i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs13, %rs14};
-; CHECK-NEXT: and.b32 %r10, %r2, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs15, %rs16};
-; CHECK-NEXT: and.b32 %r12, %r1, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v8i8_param_1];
-; CHECK-NEXT: mov.b32 %r13, {%rs19, %rs20};
-; CHECK-NEXT: mov.b32 %r14, {%rs17, %rs18};
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v8i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24};
-; CHECK-NEXT: mov.b32 %r16, {%rs21, %rs22};
-; CHECK-NEXT: xor.b32 %r17, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r18, %r9, 16711935;
-; CHECK-NEXT: xor.b32 %r19, %r7, 16711935;
-; CHECK-NEXT: xor.b32 %r20, %r5, 16711935;
-; CHECK-NEXT: and.b32 %r21, %r16, %r20;
-; CHECK-NEXT: and.b32 %r22, %r15, %r19;
-; CHECK-NEXT: and.b32 %r23, %r14, %r18;
-; CHECK-NEXT: and.b32 %r24, %r13, %r17;
-; CHECK-NEXT: or.b32 %r25, %r12, %r24;
-; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r25;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26};
-; CHECK-NEXT: or.b32 %r26, %r10, %r23;
-; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r26;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT: or.b32 %r27, %r8, %r22;
-; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r27;
-; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30};
-; CHECK-NEXT: or.b32 %r28, %r6, %r21;
-; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r28;
-; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32};
-; CHECK-NEXT: ld.u16 %r29, [%SP+0];
-; CHECK-NEXT: shl.b32 %r30, %r29, 16;
-; CHECK-NEXT: ld.u16 %r31, [%SP+2];
-; CHECK-NEXT: or.b32 %r32, %r31, %r30;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r32;
-; CHECK-NEXT: ld.u16 %r33, [%SP+4];
-; CHECK-NEXT: shl.b32 %r34, %r33, 16;
-; CHECK-NEXT: ld.u16 %r35, [%SP+6];
-; CHECK-NEXT: or.b32 %r36, %r35, %r34;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r36;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: shr.u32 %r37, %r36, 8;
-; CHECK-NEXT: shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
-; CHECK-NEXT: cvt.u16.u32 %rs36, %r37;
-; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8;
-; CHECK-NEXT: cvt.u16.u32 %rs38, %r33;
-; CHECK-NEXT: cvt.u16.u32 %rs39, %r35;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v8i8_param_2];
+; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0];
+; CHECK-NEXT: and.b32 %r11, %r9, %r5;
+; CHECK-NEXT: and.b32 %r13, %r10, %r6;
+; CHECK-NEXT: xor.b32 %r15, %r6, -1;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
 ; CHECK-NEXT: ret;
 %mx = and <8 x i8> %x, %mask
 %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -327,8 +204,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v4i16_undef(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<21>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_1];
@@ -336,15 +212,13 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v4i16_undef_param_0];
 ; CHECK-NEXT: and.b32 %r11, %r9, %r5;
 ; CHECK-NEXT: and.b32 %r13, %r10, %r6;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r15, {%rs2, %rs1};
-; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
-; CHECK-NEXT: xor.b32 %r17, %r5, -1;
-; CHECK-NEXT: and.b32 %r18, %r1, %r17;
-; CHECK-NEXT: and.b32 %r19, %r2, %r16;
-; CHECK-NEXT: or.b32 %r20, %r13, %r19;
-; CHECK-NEXT: or.b32 %r21, %r11, %r18;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
+; CHECK-NEXT: xor.b32 %r15, %r6, -65536;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
 ; CHECK-NEXT: ret;
 %mx = and <4 x i16> %x, %mask
 %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -408,90 +282,29 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
 define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v16i8(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<65>;
-; CHECK-NEXT: .reg .b32 %r<57>;
+; CHECK-NEXT: .reg .b32 %r<41>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v16i8_param_0+12];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v16i8_param_0+8];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v16i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: mov.b32 %r6, {%rs11, %rs12};
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v16i8_param_0];
-; CHECK-NEXT: mov.b32 %r7, {%rs13, %rs14};
-; CHECK-NEXT: mov.b32 %r8, {%rs15, %rs16};
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v16i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs19, %rs20};
-; CHECK-NEXT: and.b32 %r10, %r8, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs17, %rs18};
-; CHECK-NEXT: and.b32 %r12, %r7, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v16i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r13, {%rs23, %rs24};
-; CHECK-NEXT: and.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs21, %rs22};
-; CHECK-NEXT: and.b32 %r16, %r5, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v16i8_param_2+8];
-; CHECK-NEXT: mov.b32 %r17, {%rs27, %rs28};
-; CHECK-NEXT: and.b32 %r18, %r4, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs25, %rs26};
-; CHECK-NEXT: and.b32 %r20, %r3, %r19;
-; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [out_v16i8_param_2+12];
-; CHECK-NEXT: mov.b32 %r21, {%rs31, %rs32};
-; CHECK-NEXT: and.b32 %r22, %r2, %r21;
-; CHECK-NEXT: mov.b32 %r23, {%rs29, %rs30};
-; CHECK-NEXT: and.b32 %r24, %r1, %r23;
-; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [out_v16i8_param_1+12];
-; CHECK-NEXT: mov.b32 %r25, {%rs33, %rs34};
-; CHECK-NEXT: mov.b32 %r26, {%rs35, %rs36};
-; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [out_v16i8_param_1+8];
-; CHECK-NEXT: mov.b32 %r27, {%rs37, %rs38};
-; CHECK-NEXT: mov.b32 %r28, {%rs39, %rs40};
-; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [out_v16i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r29, {%rs41, %rs42};
-; CHECK-NEXT: mov.b32 %r30, {%rs43, %rs44};
-; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [out_v16i8_param_1];
-; CHECK-NEXT: mov.b32 %r31, {%rs45, %rs46};
-; CHECK-NEXT: mov.b32 %r32, {%rs47, %rs48};
-; CHECK-NEXT: xor.b32 %r33, %r23, 16711935;
-; CHECK-NEXT: xor.b32 %r34, %r21, 16711935;
-; CHECK-NEXT: xor.b32 %r35, %r19, 16711935;
-; CHECK-NEXT: xor.b32 %r36, %r17, 16711935;
-; CHECK-NEXT: xor.b32 %r37, %r15, 16711935;
-; CHECK-NEXT: xor.b32 %r38, %r13, 16711935;
-; CHECK-NEXT: xor.b32 %r39, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r40, %r9, 16711935;
-; CHECK-NEXT: and.b32 %r41, %r32, %r40;
-; CHECK-NEXT: and.b32 %r42, %r31, %r39;
-; CHECK-NEXT: and.b32 %r43, %r30, %r38;
-; CHECK-NEXT: and.b32 %r44, %r29, %r37;
-; CHECK-NEXT: and.b32 %r45, %r28, %r36;
-; CHECK-NEXT: and.b32 %r46, %r27, %r35;
-; CHECK-NEXT: and.b32 %r47, %r26, %r34;
-; CHECK-NEXT: and.b32 %r48, %r25, %r33;
-; CHECK-NEXT: or.b32 %r49, %r24, %r48;
-; CHECK-NEXT: or.b32 %r50, %r22, %r47;
-; CHECK-NEXT: or.b32 %r51, %r20, %r46;
-; CHECK-NEXT: or.b32 %r52, %r18, %r45;
-; CHECK-NEXT: or.b32 %r53, %r16, %r44;
-; CHECK-NEXT: or.b32 %r54, %r14, %r43;
-; CHECK-NEXT: or.b32 %r55, %r12, %r42;
-; CHECK-NEXT: or.b32 %r56, %r10, %r41;
-; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r56;
-; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r55;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
-; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r54;
-; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r53;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
-; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r52;
-; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r51;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
-; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r50;
-; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r49;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1];
+; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [out_v16i8_param_2];
+; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [out_v16i8_param_0];
+; CHECK-NEXT: and.b32 %r21, %r17, %r9;
+; CHECK-NEXT: and.b32 %r23, %r18, %r10;
+; CHECK-NEXT: and.b32 %r25, %r19, %r11;
+; CHECK-NEXT: and.b32 %r27, %r20, %r12;
+; CHECK-NEXT: xor.b32 %r29, %r12, -1;
+; CHECK-NEXT: xor.b32 %r30, %r11, -1;
+; CHECK-NEXT: xor.b32 %r31, %r10, -1;
+; CHECK-NEXT: xor.b32 %r32, %r9, -1;
+; CHECK-NEXT: and.b32 %r33, %r1, %r32;
+; CHECK-NEXT: and.b32 %r34, %r2, %r31;
+; CHECK-NEXT: and.b32 %r35, %r3, %r30;
+; CHECK-NEXT: and.b32 %r36, %r4, %r29;
+; CHECK-NEXT: or.b32 %r37, %r27, %r36;
+; CHECK-NEXT: or.b32 %r38, %r25, %r35;
+; CHECK-NEXT: or.b32 %r39, %r23, %r34;
+; CHECK-NEXT: or.b32 %r40, %r21, %r33;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37};
 ; CHECK-NEXT: ret;
 %mx = and <16 x i8> %x, %mask
 %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -684,44 +497,16 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind
 define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v4i8(
 ; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot18[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot18;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: xor.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs5, %rs6};
-; CHECK-NEXT: xor.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r8, %r6, %r7;
-; CHECK-NEXT: mov.b32 %r9, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r10, %r4, %r9;
-;
CHECK-NEXT: xor.b32 %r11, %r10, %r3; -; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r11; -; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14}; -; CHECK-NEXT: xor.b32 %r12, %r8, %r5; -; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r12; -; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16}; -; CHECK-NEXT: ld.u16 %r13, [%SP+0]; -; CHECK-NEXT: shl.b32 %r14, %r13, 16; -; CHECK-NEXT: ld.u16 %r15, [%SP+2]; -; CHECK-NEXT: or.b32 %r16, %r15, %r14; -; CHECK-NEXT: shr.u32 %r17, %r16, 8; -; CHECK-NEXT: cvt.u16.u32 %rs17, %r17; -; CHECK-NEXT: cvt.u16.u32 %rs18, %r13; -; CHECK-NEXT: bfe.s32 %r18, %r13, 8, 8; -; CHECK-NEXT: cvt.u16.u32 %rs19, %r18; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19}; +; CHECK-NEXT: ld.param.u32 %r1, [in_v4i8_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [in_v4i8_param_1]; +; CHECK-NEXT: xor.b32 %r3, %r1, %r2; +; CHECK-NEXT: ld.param.u32 %r4, [in_v4i8_param_2]; +; CHECK-NEXT: and.b32 %r5, %r3, %r4; +; CHECK-NEXT: xor.b32 %r6, %r5, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; ; CHECK-NEXT: ret; %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -776,80 +561,19 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8( ; CHECK: { -; CHECK-NEXT: .local .align 2 .b8 __local_depot21[8]; -; CHECK-NEXT: .reg .b64 %SP; -; CHECK-NEXT: .reg .b64 %SPL; -; CHECK-NEXT: .reg .b16 %rs<40>; -; CHECK-NEXT: .reg .b32 %r<34>; -; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.u64 %SPL, __local_depot21; -; CHECK-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v8i8_param_0+4]; -; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; -; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4}; -; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v8i8_param_0]; -; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6}; -; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8}; -; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v8i8_param_1]; -; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12}; -; CHECK-NEXT: xor.b32 %r6, %r4, %r5; -; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10}; -; CHECK-NEXT: xor.b32 %r8, %r3, %r7; -; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v8i8_param_1+4]; -; CHECK-NEXT: mov.b32 %r9, {%rs15, %rs16}; -; CHECK-NEXT: xor.b32 %r10, %r2, %r9; -; CHECK-NEXT: mov.b32 %r11, {%rs13, %rs14}; -; CHECK-NEXT: xor.b32 %r12, %r1, %r11; -; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v8i8_param_2+4]; -; CHECK-NEXT: mov.b32 %r13, {%rs17, %rs18}; -; CHECK-NEXT: and.b32 %r14, %r12, %r13; -; CHECK-NEXT: mov.b32 %r15, {%rs19, %rs20}; -; CHECK-NEXT: and.b32 %r16, %r10, %r15; -; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v8i8_param_2]; -; CHECK-NEXT: mov.b32 %r17, {%rs21, %rs22}; -; CHECK-NEXT: and.b32 %r18, %r8, %r17; -; CHECK-NEXT: mov.b32 %r19, {%rs23, %rs24}; -; CHECK-NEXT: and.b32 %r20, %r6, %r19; -; CHECK-NEXT: xor.b32 %r21, %r20, %r5; -; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r21; -; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26}; -; CHECK-NEXT: xor.b32 %r22, %r18, %r7; -; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r22; -; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28}; -; CHECK-NEXT: xor.b32 %r23, %r16, %r9; -; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r23; -; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30}; -; CHECK-NEXT: xor.b32 %r24, %r14, %r11; -; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r24; -; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32}; -; CHECK-NEXT: ld.u16 
%r25, [%SP+0]; -; CHECK-NEXT: shl.b32 %r26, %r25, 16; -; CHECK-NEXT: ld.u16 %r27, [%SP+2]; -; CHECK-NEXT: or.b32 %r28, %r27, %r26; -; CHECK-NEXT: cvt.u64.u32 %rd1, %r28; -; CHECK-NEXT: ld.u16 %r29, [%SP+4]; -; CHECK-NEXT: shl.b32 %r30, %r29, 16; -; CHECK-NEXT: ld.u16 %r31, [%SP+6]; -; CHECK-NEXT: or.b32 %r32, %r31, %r30; -; CHECK-NEXT: cvt.u64.u32 %rd2, %r32; -; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; -; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3; -; CHECK-NEXT: shr.u32 %r33, %r32, 8; -; CHECK-NEXT: shr.u64 %rd5, %rd4, 24; -; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5; -; CHECK-NEXT: shr.u64 %rd6, %rd1, 16; -; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6; -; CHECK-NEXT: shr.u64 %rd7, %rd1, 8; -; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33}; -; CHECK-NEXT: cvt.u16.u32 %rs36, %r33; -; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8; -; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8; -; CHECK-NEXT: cvt.u16.u32 %rs38, %r29; -; CHECK-NEXT: cvt.u16.u32 %rs39, %r31; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37}; +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0]; +; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v8i8_param_1]; +; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [in_v8i8_param_2]; +; CHECK-NEXT: xor.b32 %r7, %r2, %r4; +; CHECK-NEXT: and.b32 %r8, %r7, %r6; +; CHECK-NEXT: xor.b32 %r9, %r8, %r4; +; CHECK-NEXT: xor.b32 %r11, %r1, %r3; +; CHECK-NEXT: and.b32 %r12, %r11, %r5; +; CHECK-NEXT: xor.b32 %r13, %r12, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r13, %r9}; ; CHECK-NEXT: ret; %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -930,82 +654,25 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<65>; -; CHECK-NEXT: .reg .b32 %r<49>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v16i8_param_0]; -; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4}; -; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2}; -; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v16i8_param_0+4]; -; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8}; -; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6}; -; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v16i8_param_0+8]; -; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12}; -; CHECK-NEXT: mov.b32 %r6, {%rs9, %rs10}; -; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v16i8_param_0+12]; -; CHECK-NEXT: mov.b32 %r7, {%rs15, %rs16}; -; CHECK-NEXT: mov.b32 %r8, {%rs13, %rs14}; -; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v16i8_param_1+12]; -; CHECK-NEXT: mov.b32 %r9, {%rs17, %rs18}; -; CHECK-NEXT: xor.b32 %r10, %r8, %r9; -; CHECK-NEXT: mov.b32 %r11, {%rs19, %rs20}; -; CHECK-NEXT: xor.b32 %r12, %r7, %r11; -; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v16i8_param_1+8]; -; CHECK-NEXT: mov.b32 %r13, {%rs21, %rs22}; -; CHECK-NEXT: xor.b32 %r14, %r6, %r13; -; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24}; -; CHECK-NEXT: xor.b32 %r16, %r5, %r15; -; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v16i8_param_1+4]; -; CHECK-NEXT: mov.b32 %r17, {%rs25, %rs26}; -; CHECK-NEXT: xor.b32 %r18, %r4, %r17; -; CHECK-NEXT: mov.b32 %r19, {%rs27, %rs28}; -; CHECK-NEXT: xor.b32 %r20, %r3, %r19; -; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [in_v16i8_param_1]; -; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs30}; -; CHECK-NEXT: xor.b32 
%r22, %r2, %r21; -; CHECK-NEXT: mov.b32 %r23, {%rs31, %rs32}; -; CHECK-NEXT: xor.b32 %r24, %r1, %r23; -; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v16i8_param_2]; -; CHECK-NEXT: mov.b32 %r25, {%rs35, %rs36}; -; CHECK-NEXT: and.b32 %r26, %r24, %r25; -; CHECK-NEXT: mov.b32 %r27, {%rs33, %rs34}; -; CHECK-NEXT: and.b32 %r28, %r22, %r27; -; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [in_v16i8_param_2+4]; -; CHECK-NEXT: mov.b32 %r29, {%rs39, %rs40}; -; CHECK-NEXT: and.b32 %r30, %r20, %r29; -; CHECK-NEXT: mov.b32 %r31, {%rs37, %rs38}; -; CHECK-NEXT: and.b32 %r32, %r18, %r31; -; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [in_v16i8_param_2+8]; -; CHECK-NEXT: mov.b32 %r33, {%rs43, %rs44}; -; CHECK-NEXT: and.b32 %r34, %r16, %r33; -; CHECK-NEXT: mov.b32 %r35, {%rs41, %rs42}; -; CHECK-NEXT: and.b32 %r36, %r14, %r35; -; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [in_v16i8_param_2+12]; -; CHECK-NEXT: mov.b32 %r37, {%rs47, %rs48}; -; CHECK-NEXT: and.b32 %r38, %r12, %r37; -; CHECK-NEXT: mov.b32 %r39, {%rs45, %rs46}; -; CHECK-NEXT: and.b32 %r40, %r10, %r39; -; CHECK-NEXT: xor.b32 %r41, %r40, %r9; -; CHECK-NEXT: xor.b32 %r42, %r38, %r11; -; CHECK-NEXT: xor.b32 %r43, %r36, %r13; -; CHECK-NEXT: xor.b32 %r44, %r34, %r15; -; CHECK-NEXT: xor.b32 %r45, %r32, %r17; -; CHECK-NEXT: xor.b32 %r46, %r30, %r19; -; CHECK-NEXT: xor.b32 %r47, %r28, %r21; -; CHECK-NEXT: xor.b32 %r48, %r26, %r23; -; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r48; -; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r47; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50}; -; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r46; -; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r45; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54}; -; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r44; -; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r43; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58}; -; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r42; -; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r41; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62}; +; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0]; +; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1]; +; CHECK-NEXT: xor.b32 %r9, %r4, %r8; +; CHECK-NEXT: xor.b32 %r10, %r3, %r7; +; CHECK-NEXT: xor.b32 %r11, %r2, %r6; +; CHECK-NEXT: xor.b32 %r12, %r1, %r5; +; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2]; +; CHECK-NEXT: and.b32 %r17, %r12, %r13; +; CHECK-NEXT: and.b32 %r18, %r11, %r14; +; CHECK-NEXT: and.b32 %r19, %r10, %r15; +; CHECK-NEXT: and.b32 %r20, %r9, %r16; +; CHECK-NEXT: xor.b32 %r21, %r20, %r8; +; CHECK-NEXT: xor.b32 %r23, %r19, %r7; +; CHECK-NEXT: xor.b32 %r25, %r18, %r6; +; CHECK-NEXT: xor.b32 %r27, %r17, %r5; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r27, %r25, %r23, %r21}; ; CHECK-NEXT: ret; %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll index 092607462f332..8333a9b935d6a 100644 --- a/llvm/test/CodeGen/NVPTX/vec8.ll +++ b/llvm/test/CodeGen/NVPTX/vec8.ll @@ -5,10 +5,9 @@ target triple = "nvptx-unknown-cuda" ; CHECK: .visible .func foo define void @foo(<8 x i8> %a, ptr %b) { -; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] -; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] +; CHECK-DAG: ld.param.v2.u32 
{[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.u64 %[[B:rd[0-9+]]], [foo_param_1] -; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK: add.s16 [[T:%rs[0-9+]]], ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6