-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction #97755
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 | ||
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX | ||
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX | ||
|
||
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -S -Werror -Wall -o /dev/null %s | ||
// REQUIRES: aarch64-registered-target | ||
|
||
#include <arm_sme.h> | ||
|
||
// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_u8_x4( | ||
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
// CHECK-NEXT: [[ENTRY:.*:]] | ||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) | ||
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) | ||
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) | ||
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 | ||
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) | ||
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 | ||
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) | ||
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 | ||
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) | ||
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 | ||
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) | ||
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]] | ||
// | ||
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_u8_x411svuint8x2_t( | ||
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
// CHECK-CXX-NEXT: [[ENTRY:.*:]] | ||
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) | ||
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) | ||
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) | ||
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 | ||
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) | ||
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 | ||
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) | ||
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 | ||
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) | ||
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 | ||
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) | ||
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]] | ||
// | ||
svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { | ||
return svluti4_zt_u8_x4(0, op); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_s8_x4( | ||
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-NEXT: [[ENTRY:.*:]] | ||
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) | ||
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) | ||
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) | ||
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 | ||
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) | ||
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 | ||
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) | ||
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 | ||
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) | ||
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 | ||
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) | ||
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]] | ||
// | ||
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x411svuint8x2_t( | ||
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
// CHECK-CXX-NEXT: [[ENTRY:.*:]] | ||
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) | ||
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) | ||
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) | ||
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 | ||
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) | ||
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 | ||
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) | ||
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 | ||
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) | ||
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 | ||
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) | ||
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]] | ||
// | ||
svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { | ||
return svluti4_zt_s8_x4(0, op); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -410,7 +410,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { | |
} | ||
|
||
void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc, | ||
uint32_t MaxImm); | ||
uint32_t MaxImm, bool IsMultiVector = false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Could this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are 2 functions now, one for lane and without. So this is not used anymore. |
||
|
||
template <unsigned MaxIdx, unsigned Scale> | ||
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { | ||
|
@@ -1896,15 +1896,23 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, | |
|
||
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, | ||
unsigned NumOutVecs, | ||
unsigned Opc, uint32_t MaxImm) { | ||
unsigned Opc, uint32_t MaxImm, | ||
bool IsMultiVector) { | ||
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4))) | ||
if (Imm->getZExtValue() > MaxImm) | ||
return; | ||
|
||
SDValue ZtValue; | ||
SmallVector<SDValue, 4> Ops; | ||
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) | ||
return; | ||
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; | ||
Ops.push_back(ZtValue); | ||
if (IsMultiVector) { | ||
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); | ||
} else { | ||
Ops.push_back(Node->getOperand(3)); | ||
Ops.push_back(Node->getOperand(4)); | ||
} | ||
SDLoc DL(Node); | ||
EVT VT = Node->getValueType(0); | ||
|
||
|
@@ -5415,6 +5423,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { | |
SelectMultiVectorLuti(Node, 2, Opc, 3); | ||
return; | ||
} | ||
case Intrinsic::aarch64_sme_luti4_zt_x4: { | ||
// Does not have immediate but it has 2ZPR input | ||
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 0, true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No strong opinion, but as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I create a SelectMultiVectorLutiLane and a SelectMultiVectorLuti. |
||
return; | ||
} | ||
} | ||
} break; | ||
case ISD::INTRINSIC_WO_CHAIN: { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s | ||
|
||
target triple = "aarch64-linux" | ||
|
||
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 { | ||
; CHECK-LABEL: test_luti4_zt_i8: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 | ||
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 | ||
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } | ||
; CHECK-NEXT: ret | ||
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) | ||
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res | ||
} | ||
|
||
attributes #0 = { "target-features"="+sme2,+sme-lutv2"} |
Uh oh!
There was an error while loading. Please reload this page.