Skip to content

Commit 550b838

Browse files
[AArch64] Lower extending sitofp using tbl
In a similar manner as in https://reviews.llvm.org/D133494, use `TBL` to place bytes in the *upper* part of `i32` elements and then convert to float using fixed-point `scvtf`, i.e. `scvtf Vd.4s, Vn.4s, #24`. Change-Id: Ib9df3e4243612cbee8560907b24b14e76b61f265
1 parent 17f7ab7 commit 550b838

File tree

3 files changed

+233
-1
lines changed

3 files changed

+233
-1
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -8333,7 +8333,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
83338333
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
83348334
return true;
83358335

8336-
if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
8336+
if ((isa<UIToFPInst>(I) || isa<SIToFPInst>(I) || isa<FPToUIInst>(I) ||
8337+
isa<TruncInst>(I)) &&
83378338
TLI->optimizeExtendOrTruncateConversion(
83388339
I, LI->getLoopFor(I->getParent()), *TTI))
83398340
return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+35
Original file line numberDiff line numberDiff line change
@@ -15763,6 +15763,24 @@ static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
1576315763
return Result;
1576415764
}
1576515765

15766+
// Builds the byte-level shufflevector used when lowering an extending
// sitofp: the elements of Op are shuffled together with a helper vector
// whose first element is zero, using a mask obtained from
// createTblShuffleMask for the source/destination element widths.
//
// Builder        - IR builder used to create the new instructions.
// Op             - vector value to widen; its type is cast to
//                  FixedVectorType.
// DstTy          - fixed vector type whose (integer) element width is the
//                  widening target.
// IsLittleEndian - target endianness; note that it is passed *inverted* to
//                  createTblShuffleMask below.
//
// Returns the shufflevector value, or nullptr when createTblShuffleMask
// reports failure. No sign extension happens here: the caller visible in
// this diff bitcasts the result to the integer form of the destination type
// and applies an exact arithmetic shift right by 24 before the sitofp.
static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
15767+
FixedVectorType *DstTy,
15768+
bool IsLittleEndian) {
15769+
auto *SrcTy = cast<FixedVectorType>(Op->getType());
15770+
// Both element types are cast to IntegerType to read their bit widths.
auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15771+
auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15772+
15773+
SmallVector<int> Mask;
15774+
// The endianness flag is negated on purpose. Per the commit description the
// source bytes must land in the *upper* part of each wide element.
// NOTE(review): createTblShuffleMask is not visible in this diff; presumably
// the negated flag selects the most-significant byte lane - confirm.
if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
15775+
!IsLittleEndian, Mask))
15776+
return nullptr;
15777+
15778+
// Second shuffle operand: a poison vector of the source type with only
// element 0 set to an i8 zero. Presumably the mask's filler indices refer to
// this element so the remaining bytes become zero - TODO confirm against
// createTblShuffleMask. The i8 constant assumes i8 source elements; the only
// caller visible here checks isIntegerTy(8) before calling.
auto *FirstEltZero = Builder.CreateInsertElement(
15779+
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15780+
15781+
return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15782+
}
15783+
1576615784
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
1576715785
IRBuilder<> Builder(TI);
1576815786
SmallVector<Value *> Parts;
@@ -15951,6 +15969,23 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1595115969
return true;
1595215970
}
1595315971

15972+
auto *SIToFP = dyn_cast<SIToFPInst>(I);
15973+
if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15974+
DstTy->getElementType()->isFloatTy()) {
15975+
IRBuilder<> Builder(I);
15976+
auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
15977+
FixedVectorType::getInteger(DstTy),
15978+
Subtarget->isLittleEndian());
15979+
if (!Shuffle)
15980+
return false;
15981+
auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
15982+
auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
15983+
auto *SI = Builder.CreateSIToFP(AShr, DstTy);
15984+
I->replaceAllUsesWith(SI);
15985+
I->eraseFromParent();
15986+
return true;
15987+
}
15988+
1595415989
// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
1595515990
// followed by a truncate lowered to using tbl.4.
1595615991
auto *FPToUI = dyn_cast<FPToUIInst>(I);
+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -verify-machineinstrs < %s | FileCheck %s
3+
4+
target triple = "aarch64-linux"
5+
6+
; CHECK-LABEL: .LCPI0_0:
7+
; CHECK-NEXT: .byte 255
8+
; CHECK-NEXT: .byte 255
9+
; CHECK-NEXT: .byte 255
10+
; CHECK-NEXT: .byte 4
11+
; CHECK-NEXT: .byte 255
12+
; CHECK-NEXT: .byte 255
13+
; CHECK-NEXT: .byte 255
14+
; CHECK-NEXT: .byte 5
15+
; CHECK-NEXT: .byte 255
16+
; CHECK-NEXT: .byte 255
17+
; CHECK-NEXT: .byte 255
18+
; CHECK-NEXT: .byte 6
19+
; CHECK-NEXT: .byte 255
20+
; CHECK-NEXT: .byte 255
21+
; CHECK-NEXT: .byte 255
22+
; CHECK-NEXT: .byte 7
23+
; CHECK-NEXT: .LCPI0_1:
24+
; CHECK-NEXT: .byte 255
25+
; CHECK-NEXT: .byte 255
26+
; CHECK-NEXT: .byte 255
27+
; CHECK-NEXT: .byte 0
28+
; CHECK-NEXT: .byte 255
29+
; CHECK-NEXT: .byte 255
30+
; CHECK-NEXT: .byte 255
31+
; CHECK-NEXT: .byte 1
32+
; CHECK-NEXT: .byte 255
33+
; CHECK-NEXT: .byte 255
34+
; CHECK-NEXT: .byte 255
35+
; CHECK-NEXT: .byte 2
36+
; CHECK-NEXT: .byte 255
37+
; CHECK-NEXT: .byte 255
38+
; CHECK-NEXT: .byte 255
39+
; CHECK-NEXT: .byte 3
40+
define void @sitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
41+
; CHECK-LABEL: sitofp_v8i8_to_v8f32:
42+
; CHECK: // %bb.0: // %entry
43+
; CHECK-NEXT: adrp x8, .LCPI0_0
44+
; CHECK-NEXT: adrp x9, .LCPI0_1
45+
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0]
46+
; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_1]
47+
; CHECK-NEXT: mov x8, xzr
48+
; CHECK-NEXT: .LBB0_1: // %loop
49+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
50+
; CHECK-NEXT: ldr d2, [x0, x8, lsl #3]
51+
; CHECK-NEXT: add x9, x1, x8, lsl #5
52+
; CHECK-NEXT: add x8, x8, #1
53+
; CHECK-NEXT: cmp x8, #1000
54+
; CHECK-NEXT: tbl v3.16b, { v2.16b }, v0.16b
55+
; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b
56+
; CHECK-NEXT: scvtf v3.4s, v3.4s, #24
57+
; CHECK-NEXT: scvtf v2.4s, v2.4s, #24
58+
; CHECK-NEXT: stp q2, q3, [x9]
59+
; CHECK-NEXT: b.eq .LBB0_1
60+
; CHECK-NEXT: // %bb.2: // %exit
61+
; CHECK-NEXT: ret
62+
entry:
63+
br label %loop
64+
65+
loop:
66+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
67+
%gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv
68+
%l = load <8 x i8>, ptr %gep.src
69+
%conv = sitofp <8 x i8> %l to <8 x float>
70+
%gep.dst = getelementptr inbounds <8 x float>, ptr %dst, i64 %iv
71+
store <8 x float> %conv, ptr %gep.dst
72+
%iv.next = add i64 %iv, 1
73+
%ec = icmp eq i64 %iv.next, 1000
74+
br i1 %ec, label %loop, label %exit
75+
76+
exit:
77+
ret void
78+
}
79+
80+
; CHECK-LABEL: .LCPI1_0:
81+
; CHECK-NEXT: .byte 255
82+
; CHECK-NEXT: .byte 255
83+
; CHECK-NEXT: .byte 255
84+
; CHECK-NEXT: .byte 12
85+
; CHECK-NEXT: .byte 255
86+
; CHECK-NEXT: .byte 255
87+
; CHECK-NEXT: .byte 255
88+
; CHECK-NEXT: .byte 13
89+
; CHECK-NEXT: .byte 255
90+
; CHECK-NEXT: .byte 255
91+
; CHECK-NEXT: .byte 255
92+
; CHECK-NEXT: .byte 14
93+
; CHECK-NEXT: .byte 255
94+
; CHECK-NEXT: .byte 255
95+
; CHECK-NEXT: .byte 255
96+
; CHECK-NEXT: .byte 15
97+
; CHECK-NEXT: .LCPI1_1:
98+
; CHECK-NEXT: .byte 255
99+
; CHECK-NEXT: .byte 255
100+
; CHECK-NEXT: .byte 255
101+
; CHECK-NEXT: .byte 8
102+
; CHECK-NEXT: .byte 255
103+
; CHECK-NEXT: .byte 255
104+
; CHECK-NEXT: .byte 255
105+
; CHECK-NEXT: .byte 9
106+
; CHECK-NEXT: .byte 255
107+
; CHECK-NEXT: .byte 255
108+
; CHECK-NEXT: .byte 255
109+
; CHECK-NEXT: .byte 10
110+
; CHECK-NEXT: .byte 255
111+
; CHECK-NEXT: .byte 255
112+
; CHECK-NEXT: .byte 255
113+
; CHECK-NEXT: .byte 11
114+
; CHECK-NEXT: .LCPI1_2:
115+
; CHECK-NEXT: .byte 255
116+
; CHECK-NEXT: .byte 255
117+
; CHECK-NEXT: .byte 255
118+
; CHECK-NEXT: .byte 4
119+
; CHECK-NEXT: .byte 255
120+
; CHECK-NEXT: .byte 255
121+
; CHECK-NEXT: .byte 255
122+
; CHECK-NEXT: .byte 5
123+
; CHECK-NEXT: .byte 255
124+
; CHECK-NEXT: .byte 255
125+
; CHECK-NEXT: .byte 255
126+
; CHECK-NEXT: .byte 6
127+
; CHECK-NEXT: .byte 255
128+
; CHECK-NEXT: .byte 255
129+
; CHECK-NEXT: .byte 255
130+
; CHECK-NEXT: .byte 7
131+
; CHECK-NEXT: .LCPI1_3:
132+
; CHECK-NEXT: .byte 255
133+
; CHECK-NEXT: .byte 255
134+
; CHECK-NEXT: .byte 255
135+
; CHECK-NEXT: .byte 0
136+
; CHECK-NEXT: .byte 255
137+
; CHECK-NEXT: .byte 255
138+
; CHECK-NEXT: .byte 255
139+
; CHECK-NEXT: .byte 1
140+
; CHECK-NEXT: .byte 255
141+
; CHECK-NEXT: .byte 255
142+
; CHECK-NEXT: .byte 255
143+
; CHECK-NEXT: .byte 2
144+
; CHECK-NEXT: .byte 255
145+
; CHECK-NEXT: .byte 255
146+
; CHECK-NEXT: .byte 255
147+
; CHECK-NEXT: .byte 3
148+
149+
define void @sitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
150+
; CHECK-LABEL: sitofp_v16i8_to_v16f32:
151+
; CHECK: // %bb.0: // %entry
152+
; CHECK-NEXT: adrp x8, .LCPI1_0
153+
; CHECK-NEXT: adrp x9, .LCPI1_1
154+
; CHECK-NEXT: adrp x10, .LCPI1_2
155+
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
156+
; CHECK-NEXT: adrp x8, .LCPI1_3
157+
; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI1_1]
158+
; CHECK-NEXT: ldr q2, [x10, :lo12:.LCPI1_2]
159+
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_3]
160+
; CHECK-NEXT: mov x8, xzr
161+
; CHECK-NEXT: .LBB1_1: // %loop
162+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
163+
; CHECK-NEXT: ldr q4, [x0, x8, lsl #4]
164+
; CHECK-NEXT: add x9, x1, x8, lsl #6
165+
; CHECK-NEXT: add x8, x8, #1
166+
; CHECK-NEXT: cmp x8, #1000
167+
; CHECK-NEXT: tbl v5.16b, { v4.16b }, v0.16b
168+
; CHECK-NEXT: tbl v6.16b, { v4.16b }, v1.16b
169+
; CHECK-NEXT: tbl v7.16b, { v4.16b }, v2.16b
170+
; CHECK-NEXT: tbl v4.16b, { v4.16b }, v3.16b
171+
; CHECK-NEXT: scvtf v5.4s, v5.4s, #24
172+
; CHECK-NEXT: scvtf v6.4s, v6.4s, #24
173+
; CHECK-NEXT: scvtf v7.4s, v7.4s, #24
174+
; CHECK-NEXT: scvtf v4.4s, v4.4s, #24
175+
; CHECK-NEXT: stp q6, q5, [x9, #32]
176+
; CHECK-NEXT: stp q4, q7, [x9]
177+
; CHECK-NEXT: b.eq .LBB1_1
178+
; CHECK-NEXT: // %bb.2: // %exit
179+
; CHECK-NEXT: ret
180+
entry:
181+
br label %loop
182+
183+
loop:
184+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
185+
%gep.src = getelementptr inbounds <16 x i8>, ptr %src, i64 %iv
186+
%l = load <16 x i8>, ptr %gep.src
187+
%conv = sitofp <16 x i8> %l to <16 x float>
188+
%gep.dst = getelementptr inbounds <16 x float>, ptr %dst, i64 %iv
189+
store <16 x float> %conv, ptr %gep.dst
190+
%iv.next = add i64 %iv, 1
191+
%ec = icmp eq i64 %iv.next, 1000
192+
br i1 %ec, label %loop, label %exit
193+
194+
exit:
195+
ret void
196+
}

0 commit comments

Comments
 (0)