Skip to content

[Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction #97755

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{

defm SVREADZ_VG2 : ZAReadzArray<"2">;
defm SVREADZ_VG4 : ZAReadzArray<"4">;

let SMETargetGuard = "sme2,sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}

} // let SVETargetGuard = InvalidMode
82 changes: 82 additions & 0 deletions clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX

// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -S -Werror -Wall -o /dev/null %s
// REQUIRES: aarch64-registered-target

#include <arm_sme.h>

// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_u8_x4(
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]]
//
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_u8_x411svuint8x2_t(
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]]
//
svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_u8_x4(0, op);
}

// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_s8_x4(
// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]]
//
// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x411svuint8x2_t(
// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32)
// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48)
// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]]
//
svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_s8_x4(0, op);
}
5 changes: 5 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
}

void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
// Check Zt tile 0
svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3678,6 +3678,12 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;

def int_aarch64_sme_luti4_zt_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

}

// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
}

void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
uint32_t MaxImm);
uint32_t MaxImm, bool IsMultiVector = false);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Could this bool be renamed to something like AreIndicesMultiVector?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are 2 functions now, one for lane and without. So this is not used anymore.


template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
Expand Down Expand Up @@ -1896,15 +1896,23 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,

void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm) {
unsigned Opc, uint32_t MaxImm,
bool IsMultiVector) {
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
if (Imm->getZExtValue() > MaxImm)
return;

SDValue ZtValue;
SmallVector<SDValue, 4> Ops;
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
Ops.push_back(ZtValue);
if (IsMultiVector) {
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
} else {
Ops.push_back(Node->getOperand(3));
Ops.push_back(Node->getOperand(4));
}
SDLoc DL(Node);
EVT VT = Node->getValueType(0);

Expand Down Expand Up @@ -5415,6 +5423,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectMultiVectorLuti(Node, 2, Opc, 3);
return;
}
case Intrinsic::aarch64_sme_luti4_zt_x4: {
// Does not have immediate but it has 2ZPR input
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 0, true);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No strong opinion, but as isMultiVector and MaxImm are mutually exclusive parameters (i.e, if you supply one, it makes no sense to supply the other), you could separate this into two separate functions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I create a SelectMultiVectorLutiLane and a SelectMultiVectorLuti.

return;
}
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;

let Predicates = [HasSME2, HasSME_LUTv2] in {
defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
} //[HasSME2, HasSME_LUTv2]

let Predicates = [HasSME2p1, HasSME_LUTv2] in {
Expand Down
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
; CHECK-LABEL: test_luti4_zt_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
; CHECK-NEXT: ret
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
}

attributes #0 = { "target-features"="+sme2,+sme-lutv2"}
Loading