Skip to content

Commit c91cd4f

Browse files
author
Joe Ellis
committed
[AArch64][SVE][InstCombine] Replace last{a,b} intrinsics with extracts...
Replace last{a,b} intrinsics with extracts when the predicate used by last{a,b} specifies a known vector length.

For example:
  aarch64_sve_lasta(VL1, D) -> extractelement(D, #1)
  aarch64_sve_lastb(VL1, D) -> extractelement(D, #0)

Co-authored-by: Paul Walker <[email protected]>

Differential Revision: https://reviews.llvm.org/D100476
1 parent 90248f2 commit c91cd4f

File tree

3 files changed

+250
-0
lines changed

3 files changed

+250
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "llvm/IR/IntrinsicsAArch64.h"
1919
#include "llvm/IR/PatternMatch.h"
2020
#include "llvm/Support/Debug.h"
21+
#include "llvm/Transforms/InstCombine/InstCombiner.h"
2122
#include <algorithm>
2223
using namespace llvm;
2324
using namespace llvm::PatternMatch;
@@ -279,6 +280,101 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
279280
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
280281
}
281282

283+
// Simplify an SVE last{a,b} intrinsic to a plain extractelement when the
// lane implied by its governing predicate is a compile-time constant:
//
//   lasta (all-false PG, V)       -> extractelement V, 0
//   last{a,b} (ptrue(vlN) PG, V)  -> extractelement V, <fixed lane>
//
// Returns the replacement instruction, or None when no simplification
// applies.
static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                  IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;

  // Replace the intrinsic with "extractelement Vec, Lane", preserving the
  // original instruction's name.
  auto ReplaceWithExtract = [&](unsigned Lane) -> Instruction * {
    auto *Int64Ty = Type::getInt64Ty(II.getContext());
    auto *EEI =
        ExtractElementInst::Create(Vec, ConstantInt::get(Int64Ty, Lane));
    EEI->insertBefore(&II);
    EEI->takeName(&II);
    return IC.replaceInstUsesWith(II, EEI);
  };

  // With no active predicate elements, lasta yields the first vector element.
  if (IsAfter) {
    auto *PredConst = dyn_cast<Constant>(Pg);
    if (PredConst && PredConst->isNullValue())
      return ReplaceWithExtract(0);
  }

  // From here on the predicate must be a ptrue with a fixed vector-length
  // pattern, otherwise the extraction lane is unknown.
  auto *PTrue = dyn_cast<IntrinsicInst>(Pg);
  if (!PTrue || PTrue->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  const auto PTruePattern =
      cast<ConstantInt>(PTrue->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned LaneIdx;
  switch (PTruePattern) {
  case AArch64SVEPredPattern::vl1:
    LaneIdx = 0;
    break;
  case AArch64SVEPredPattern::vl2:
    LaneIdx = 1;
    break;
  case AArch64SVEPredPattern::vl3:
    LaneIdx = 2;
    break;
  case AArch64SVEPredPattern::vl4:
    LaneIdx = 3;
    break;
  case AArch64SVEPredPattern::vl5:
    LaneIdx = 4;
    break;
  case AArch64SVEPredPattern::vl6:
    LaneIdx = 5;
    break;
  case AArch64SVEPredPattern::vl7:
    LaneIdx = 6;
    break;
  case AArch64SVEPredPattern::vl8:
    LaneIdx = 7;
    break;
  case AArch64SVEPredPattern::vl16:
    LaneIdx = 15;
    break;
  default:
    return None;
  }

  // lasta extracts the element *after* the last active predicate element, so
  // bump the lane by one.
  if (IsAfter)
    ++LaneIdx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (LaneIdx >= PgVTy->getMinNumElements())
    return None;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  return ReplaceWithExtract(LaneIdx);
}
362+
363+
// Target hook: attempt target-specific InstCombine simplification of an
// AArch64 intrinsic call. Returns the replacement instruction when a fold
// fired, or None to let generic InstCombine handling proceed.
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  // Dispatch on the intrinsic ID; anything unrecognized is left untouched.
  const Intrinsic::ID IID = II.getIntrinsicID();
  if (IID == Intrinsic::aarch64_sve_lasta ||
      IID == Intrinsic::aarch64_sve_lastb)
    return instCombineSVELast(IC, II);

  return None;
}
377+
282378
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
283379
ArrayRef<const Value *> Args) {
284380

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
100100
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
101101
TTI::TargetCostKind CostKind);
102102

103+
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
104+
IntrinsicInst &II) const;
105+
103106
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
104107
switch (K) {
105108
case TargetTransformInfo::RGK_Scalar:
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -instcombine -S < %s | FileCheck --check-prefix OPT %s

target triple = "aarch64"

; Most of the testing is covered by the lastb cases, but here we ensure that
; lasta with a predicate having no active lanes is treated as an alias to
; extracting the first vector element.
define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lasta_extractelement_0(
; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
; OPT-NEXT: ret i8 [[E0]]
;
  %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
  ret i8 %e0
}

; Most of the testing is covered by the lastb cases, but here we check the
; resulting extraction index is one more than the lastb case because lasta
; extracts the element after the last active.
define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lasta_extractelement_8(
; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
; OPT-NEXT: ret i8 [[E1]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
  %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e1
}

; The lastb tests below walk the ptrue patterns vl1..vl8 (immediates 1-8) and
; vl16 (immediate 9), each of which folds to an extract of lane N-1.
define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_0(
; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
; OPT-NEXT: ret i8 [[E0]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
  %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e0
}

define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_1(
; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
; OPT-NEXT: ret i8 [[E1]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
  %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e1
}

define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_2(
; OPT-NEXT: [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
; OPT-NEXT: ret i8 [[E2]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
  %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e2
}

define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_3(
; OPT-NEXT: [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
; OPT-NEXT: ret i8 [[E3]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
  %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e3
}

define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_4(
; OPT-NEXT: [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
; OPT-NEXT: ret i8 [[E4]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
  %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e4
}

define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_5(
; OPT-NEXT: [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
; OPT-NEXT: ret i8 [[E5]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
  %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e5
}

define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_6(
; OPT-NEXT: [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
; OPT-NEXT: ret i8 [[E6]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
  %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e6
}

define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_7(
; OPT-NEXT: [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
; OPT-NEXT: ret i8 [[E7]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
  %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e7
}

define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_15(
; OPT-NEXT: [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
; OPT-NEXT: ret i8 [[E15]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
  %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e15
}

; No transformation because the requested element is beyond the range of the
; known minimum element count so we maintain the user's intentions.
define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_31(
; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
; OPT-NEXT: [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
; OPT-NEXT: ret i8 [[E31]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
  %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e31
}

; No transformation because the ptrue's predicate pattern is bogus and thus
; nothing can be inferred about the result.
define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
; OPT-NEXT: [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
; OPT-NEXT: ret i8 [[E]]
;
  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
  %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
  ret i8 %e
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)

attributes #0 = { "target-features"="+sve" }

0 commit comments

Comments
 (0)