Skip to content

Commit db19330

Browse files
committed
[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types
1 parent b002711 commit db19330

File tree

11 files changed

+2127
-191
lines changed

11 files changed

+2127
-191
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18479,6 +18479,25 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1847918479
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
1848018480
return Builder.CreateCall(F, Args);
1848118481
}
18482+
case AMDGPU::BI__builtin_amdgcn_permlane16:
18483+
case AMDGPU::BI__builtin_amdgcn_permlanex16: {
18484+
Intrinsic::ID IID;
18485+
IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
18486+
? Intrinsic::amdgcn_permlane16
18487+
: Intrinsic::amdgcn_permlanex16;
18488+
18489+
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18490+
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18491+
llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18492+
llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
18493+
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
18494+
llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
18495+
18496+
llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType());
18497+
return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
18498+
}
18499+
case AMDGPU::BI__builtin_amdgcn_permlane64:
18500+
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64);
1848218501
case AMDGPU::BI__builtin_amdgcn_readlane:
1848318502
return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
1848418503
case AMDGPU::BI__builtin_amdgcn_readfirstlane:

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2488,15 +2488,15 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
24882488

24892489
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
24902490
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
2491-
Intrinsic<[llvm_i32_ty],
2492-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2491+
Intrinsic<[llvm_any_ty],
2492+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
24932493
[IntrNoMem, IntrConvergent, IntrWillReturn,
24942494
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
24952495

24962496
// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
24972497
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
2498-
Intrinsic<[llvm_i32_ty],
2499-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2498+
Intrinsic<[llvm_any_ty],
2499+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
25002500
[IntrNoMem, IntrConvergent, IntrWillReturn,
25012501
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
25022502

@@ -2540,7 +2540,7 @@ def int_amdgcn_image_bvh_intersect_ray :
25402540
// llvm.amdgcn.permlane64 <src0>
25412541
def int_amdgcn_permlane64 :
25422542
ClangBuiltin<"__builtin_amdgcn_permlane64">,
2543-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
2543+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
25442544
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
25452545

25462546
def int_amdgcn_ds_add_gs_reg_rtn :

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
413413
assert(ST->hasPermLaneX16());
414414
V = B.CreateBitCast(V, IntNTy);
415415
Value *Permlanex16Call = B.CreateIntrinsic(
416-
Intrinsic::amdgcn_permlanex16, {},
416+
V->getType(), Intrinsic::amdgcn_permlanex16, {},
417417
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
418418
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
419419
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -425,7 +425,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
425425
// Reduce across the upper and lower 32 lanes.
426426
V = B.CreateBitCast(V, IntNTy);
427427
Value *Permlane64Call =
428-
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
428+
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, {}, V);
429429
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
430430
B.CreateBitCast(Permlane64Call, AtomicTy));
431431
}
@@ -481,7 +481,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
481481
assert(ST->hasPermLaneX16());
482482
V = B.CreateBitCast(V, IntNTy);
483483
Value *PermX = B.CreateIntrinsic(
484-
Intrinsic::amdgcn_permlanex16, {},
484+
V->getType(), Intrinsic::amdgcn_permlanex16, {},
485485
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
486486

487487
Value *UpdateDPPCall =

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5499,6 +5499,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54995499
NODE_NAME_CASE(READLANE)
55005500
NODE_NAME_CASE(READFIRSTLANE)
55015501
NODE_NAME_CASE(WRITELANE)
5502+
NODE_NAME_CASE(PERMLANE16)
5503+
NODE_NAME_CASE(PERMLANEX16)
5504+
NODE_NAME_CASE(PERMLANE64)
55025505
NODE_NAME_CASE(DUMMY_CHAIN)
55035506
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
55045507
NODE_NAME_CASE(LOAD_D16_HI)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,9 @@ enum NodeType : unsigned {
561561
READLANE,
562562
READFIRSTLANE,
563563
WRITELANE,
564+
PERMLANE16,
565+
PERMLANEX16,
566+
PERMLANE64,
564567

565568
DUMMY_CHAIN,
566569
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,9 +354,21 @@ def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
354354
SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
355355
]>;
356356

357+
def AMDGPUDPermlane16Op : SDTypeProfile<1, 6, [
358+
SDTCisSameAs<0, 1>, // old
359+
SDTCisSameAs<0, 2>, // src0
360+
SDTCisInt<3>, // src1
361+
SDTCisInt<4>, // src2
362+
SDTCisInt<5>, // i1 fi
363+
SDTCisInt<6> // i1 bound_ctrl
364+
]>;
365+
357366
def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
358367
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
359368
def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
369+
def AMDGPUpermlane16_impl : SDNode<"AMDGPUISD::PERMLANE16", AMDGPUDPermlane16Op>;
370+
def AMDGPUpermlanex16_impl : SDNode<"AMDGPUISD::PERMLANEX16", AMDGPUDPermlane16Op>;
371+
def AMDGPUpermlane64_impl : SDNode<"AMDGPUISD::PERMLANE64", AMDGPUReadfirstlaneOp>;
360372

361373
// SI+ export
362374
def AMDGPUExportOp : SDTypeProfile<0, 8, [
@@ -535,3 +547,16 @@ def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
535547
[(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
536548
(AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
537549

550+
def AMDGPUpermlane16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
551+
[(int_amdgcn_permlane16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
552+
(AMDGPUpermlane16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
553+
554+
def AMDGPUpermlanex16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
555+
[(int_amdgcn_permlanex16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5),
556+
(AMDGPUpermlanex16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>;
557+
558+
def AMDGPUpermlane64 : PatFrags<(ops node:$src),
559+
[(int_amdgcn_permlane64 node:$src),
560+
(AMDGPUpermlane64_impl node:$src)]>;
561+
562+

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5397,25 +5397,39 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53975397
Register DstReg = MI.getOperand(0).getReg();
53985398
Register Src0 = MI.getOperand(2).getReg();
53995399

5400+
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5401+
IID == Intrinsic::amdgcn_permlanex16;
5402+
54005403
auto createLaneOp = [&](Register Src0, Register Src1,
54015404
Register Src2) -> Register {
54025405
auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
54035406
switch (IID) {
54045407
case Intrinsic::amdgcn_readfirstlane:
5408+
case Intrinsic::amdgcn_permlane64:
54055409
return LaneOp.getReg(0);
54065410
case Intrinsic::amdgcn_readlane:
54075411
return LaneOp.addUse(Src1).getReg(0);
54085412
case Intrinsic::amdgcn_writelane:
54095413
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5414+
case Intrinsic::amdgcn_permlane16:
5415+
case Intrinsic::amdgcn_permlanex16: {
5416+
Register Src3 = MI.getOperand(5).getReg();
// Operands 6 and 7 are the <fi> and <bound_control> immediates; keep them
// as plain integers instead of routing them through the Register wrapper.
5417+
int64_t Src4 = MI.getOperand(6).getImm();
5418+
int64_t Src5 = MI.getOperand(7).getImm();
5419+
return LaneOp.addUse(Src1).addUse(Src2).
5420+
addUse(Src3).
5421+
addImm(Src4).
5422+
addImm(Src5).getReg(0);
5423+
}
54105424
default:
54115425
llvm_unreachable("unhandled lane op");
54125426
}
54135427
};
54145428

54155429
Register Src1, Src2;
5416-
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5430+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
54175431
Src1 = MI.getOperand(3).getReg();
5418-
if (IID == Intrinsic::amdgcn_writelane) {
5432+
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
54195433
Src2 = MI.getOperand(4).getReg();
54205434
}
54215435
}
@@ -5433,7 +5447,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54335447
? Src0
54345448
: B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
54355449
Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
5436-
if (Src2.isValid()) {
5450+
5451+
if (IsPermLane16) {
// Widen Src1 the same way as Src0 above: a non-scalar value is first
// bitcast to a same-sized integer, then any-extended to 32 bits. The
// bitcast must read Src1 itself, not Src2.
5452+
Register Src1Cast =
5453+
MRI.getType(Src1).isScalar()
5454+
? Src1
5455+
: B.buildBitcast(LLT::scalar(Size), Src1).getReg(0);
5456+
Src1 = B.buildAnyExt(LLT::scalar(32), Src1Cast).getReg(0);
5457+
}
5458+
5459+
if (IID == Intrinsic::amdgcn_writelane) {
54375460
Register Src2Cast =
54385461
MRI.getType(Src2).isScalar()
54395462
? Src2
@@ -5485,46 +5508,48 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54855508
}
54865509
break;
54875510
}
5488-
case Intrinsic::amdgcn_readfirstlane: {
5511+
case Intrinsic::amdgcn_readfirstlane:
5512+
case Intrinsic::amdgcn_permlane64: {
54895513
for (unsigned i = 0; i < NumParts; ++i) {
54905514
Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
54915515
: Src0Parts.getReg(i);
54925516
PartialRes.push_back(
5493-
(B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
5517+
(B.buildIntrinsic(IID, {S32})
54945518
.addUse(Src0)
54955519
.getReg(0)));
54965520
}
54975521

54985522
break;
54995523
}
5500-
case Intrinsic::amdgcn_writelane: {
5524+
case Intrinsic::amdgcn_writelane:
5525+
case Intrinsic::amdgcn_permlane16:
5526+
case Intrinsic::amdgcn_permlanex16: {
55015527
Register Src1 = MI.getOperand(3).getReg();
55025528
Register Src2 = MI.getOperand(4).getReg();
5503-
MachineInstrBuilder Src2Parts;
5529+
5530+
Register SrcX = IsPermLane16 ? Src1 : Src2;
5531+
MachineInstrBuilder SrcXParts;
55045532

55055533
if (Ty.isPointer()) {
5506-
auto PtrToInt = B.buildPtrToInt(S64, Src2);
5507-
Src2Parts = B.buildUnmerge(S32, PtrToInt);
5534+
auto PtrToInt = B.buildPtrToInt(S64, SrcX);
5535+
SrcXParts = B.buildUnmerge(S32, PtrToInt);
55085536
} else if (Ty.isPointerVector()) {
55095537
LLT IntVecTy = Ty.changeElementType(
55105538
LLT::scalar(Ty.getElementType().getSizeInBits()));
5511-
auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
5512-
Src2Parts = B.buildUnmerge(S32, PtrToInt);
5539+
auto PtrToInt = B.buildPtrToInt(IntVecTy, SrcX);
5540+
SrcXParts = B.buildUnmerge(S32, PtrToInt);
55135541
} else
5514-
Src2Parts =
5515-
IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
5542+
SrcXParts =
5543+
IsS16Vec ? B.buildUnmerge(V2S16, SrcX) : B.buildUnmerge(S32, SrcX);
55165544

55175545
for (unsigned i = 0; i < NumParts; ++i) {
55185546
Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
55195547
: Src0Parts.getReg(i);
5520-
Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
5521-
: Src2Parts.getReg(i);
5522-
PartialRes.push_back(
5523-
(B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
5524-
.addUse(Src0)
5525-
.addUse(Src1)
5526-
.addUse(Src2))
5527-
.getReg(0));
5548+
SrcX = IsS16Vec ? B.buildBitcast(S32, SrcXParts.getReg(i)).getReg(0)
5549+
: SrcXParts.getReg(i);
5550+
PartialRes.push_back( IsPermLane16 ?
5551+
createLaneOp(Src0, SrcX, Src2) :
5552+
createLaneOp(Src0, Src1, SrcX));
55285553
}
55295554

55305555
break;
@@ -7519,6 +7544,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75197544
case Intrinsic::amdgcn_readlane:
75207545
case Intrinsic::amdgcn_writelane:
75217546
case Intrinsic::amdgcn_readfirstlane:
7547+
case Intrinsic::amdgcn_permlane16:
7548+
case Intrinsic::amdgcn_permlanex16:
7549+
case Intrinsic::amdgcn_permlane64:
75227550
return legalizeLaneOp(Helper, MI, IntrID);
75237551
default: {
75247552
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6091,22 +6091,38 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
60916091
EVT VT = N->getValueType(0);
60926092
unsigned ValSize = VT.getSizeInBits();
60936093
unsigned IntrinsicID = N->getConstantOperandVal(0);
6094+
bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 ||
6095+
IntrinsicID == Intrinsic::amdgcn_permlanex16;
6096+
bool IsPermLane64 = IntrinsicID == Intrinsic::amdgcn_permlane64;
60946097
SDValue Src0 = N->getOperand(1);
60956098
SDLoc SL(N);
60966099
MVT IntVT = MVT::getIntegerVT(ValSize);
60976100

6098-
auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
6099-
MVT VT) -> SDValue {
6100-
return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
6101-
: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
6102-
: DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
6101+
auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2,
6102+
MVT ValueT) -> SDValue {
6103+
if (IsPermLane16 || IsPermLane64) {
6104+
if (IsPermLane16) {
6105+
SDValue Src3 = N->getOperand(4);
6106+
SDValue Src4 = N->getOperand(5);
6107+
SDValue Src5 = N->getOperand(6);
6108+
return DAG.getNode(IntrinsicID == Intrinsic::amdgcn_permlane16
6109+
? AMDGPUISD::PERMLANE16 : AMDGPUISD::PERMLANEX16,
6110+
SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5});
6111+
}
6112+
return DAG.getNode(AMDGPUISD::PERMLANE64, SL, ValueT, {Src0});
6113+
}
6114+
6115+
return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2})
6116+
: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1})
6117+
: DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, ValueT, {Src0}));
61036118
};
61046119

61056120
SDValue Src1, Src2;
61066121
if (IntrinsicID == Intrinsic::amdgcn_readlane ||
6107-
IntrinsicID == Intrinsic::amdgcn_writelane) {
6122+
IntrinsicID == Intrinsic::amdgcn_writelane ||
6123+
IsPermLane16) {
61086124
Src1 = N->getOperand(2);
6109-
if (IntrinsicID == Intrinsic::amdgcn_writelane)
6125+
if (IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16)
61106126
Src2 = N->getOperand(3);
61116127
}
61126128

@@ -6118,10 +6134,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61186134
if (ValSize < 32) {
61196135
SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
61206136
Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
6121-
if (Src2.getNode()) {
6137+
6138+
if (IsPermLane16) {
6139+
SDValue Src1Cast = DAG.getBitcast(IntVT, Src1);
6140+
Src1 = DAG.getAnyExtOrTrunc(Src1Cast, SL, MVT::i32);
6141+
}
6142+
6143+
if (IntrinsicID == Intrinsic::amdgcn_writelane) {
61226144
SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
61236145
Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
61246146
}
6147+
61256148
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
61266149
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
61276150
return DAG.getBitcast(VT, Trunc);
@@ -6131,7 +6154,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61316154
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
61326155
Src0 = DAG.getBitcast(VecVT, Src0);
61336156

6134-
if (Src2.getNode())
6157+
if (IsPermLane16) {
6158+
Src1 = DAG.getBitcast(VecVT, Src1);
6159+
}
6160+
6161+
if (IntrinsicID == Intrinsic::amdgcn_writelane)
61356162
Src2 = DAG.getBitcast(VecVT, Src2);
61366163

61376164
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8612,6 +8639,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
86128639
case Intrinsic::amdgcn_readlane:
86138640
case Intrinsic::amdgcn_readfirstlane:
86148641
case Intrinsic::amdgcn_writelane:
8642+
case Intrinsic::amdgcn_permlane16:
8643+
case Intrinsic::amdgcn_permlanex16:
8644+
case Intrinsic::amdgcn_permlane64:
86158645
return lowerLaneOp(*this, Op.getNode(), DAG);
86168646
default:
86178647
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -717,15 +717,19 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
717717
let SubtargetPredicate = isGFX11Plus in {
718718
// Restrict src0 to be VGPR
719719
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
720-
getVOP1Pat<int_amdgcn_permlane64,
721-
VOP_MOVRELS>.ret,
722-
/*VOP1Only=*/ 1>;
720+
[], /*VOP1Only=*/ 1>;
723721
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
724722
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
725723
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
726724
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
727725
} // End SubtargetPredicate = isGFX11Plus
728726

727+
foreach vt = Reg32Types.types in {
728+
def : GCNPat<(AMDGPUpermlane64 (vt VRegSrc_32:$src0)),
729+
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
730+
>;
731+
}
732+
729733
//===----------------------------------------------------------------------===//
730734
// Target-specific instruction encodings.
731735
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)