
Commit 733c6b1

Author: Evan Cheng
LLVM sdisel normalizes bit extraction of the form:

  ((x & 0xff00) >> 8) << 2
to
  (x >> 6) & 0x3fc

This is general goodness since it folds a left shift into the mask. However,
the trailing zeros in the mask prevent the ARM backend from using the bit
extraction instructions, and worse, materializing the mask may require an
extra instruction. This comes up fairly frequently when the result of the bit
twiddling is used as a memory address, e.g.

  = ptr[(x & 0xFF0000) >> 16]

We want to generate:

  ubfx   r3, r1, #16, #8
  ldr.w  r3, [r0, r3, lsl #2]

vs.

  mov.w  r9, #1020
  and.w  r2, r9, r1, lsr #14
  ldr    r2, [r0, r2]

Add a late ARM-specific isel optimization to
ARMDAGToDAGISel::PreprocessISelDAG(). It folds the left shift into the
'base + offset' address computation, changes the mask to one without trailing
zeros, and enables the use of ubfx.

Note the optimization has to be done late since it's target specific and we
don't want to change the DAG normalization. It's also fairly restrictive, as
shifter operands are not always free: it's only done for left shifts of 1 or
2, which are known to be free on some CPUs and are the most common in address
computation.

This is a slight win for blowfish, rijndael, etc.

rdar://12870177

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170581 91177308-0d34-0410-b5e6-96231b3b80d8
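
As a quick standalone sanity check of the rewrite described above (illustrative only, not part of the commit; the sampling stride is arbitrary), the three forms below compute the same offset:

// Illustrative check (not part of the commit): the source-level form, the
// DAG-normalized form, and the ARM-friendly re-associated form all agree.
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t i = 0; i <= 0xffffffffULL; i += 0x10001) {  // sample the 32-bit range
    uint32_t x = static_cast<uint32_t>(i);
    uint32_t orig = ((x & 0xff00u) >> 8) << 2;   // source-level expression
    uint32_t norm = (x >> 6) & 0x3fcu;           // DAG-normalized: shift folded into mask
    uint32_t arm  = ((x >> 8) & 0xffu) << 2;     // ubfx-style extract + lsl #2 shifter operand
    assert(orig == norm && norm == arm);
  }
  return 0;
}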
1 parent 28d24c9 commit 733c6b1

2 files changed: 132 additions, 2 deletions

lib/Target/ARM/ARMISelDAGToDAG.cpp

Lines changed: 107 additions & 2 deletions
@@ -78,6 +78,8 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
     return "ARM Instruction Selection";
   }
 
+  virtual void PreprocessISelDAG();
+
   /// getI32Imm - Return a target constant of type i32 with the specified
   /// value.
   inline SDValue getI32Imm(unsigned Imm) {
@@ -327,6 +329,87 @@ static bool isScaledConstantInRange(SDValue Node, int Scale,
   return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
 }
 
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->hasV6T2Ops())
+    return;
+
+  bool isThumb2 = Subtarget->isThumb();
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+
+    if (N->getOpcode() != ISD::ADD)
+      continue;
+
+    // Look for (add X1, (and (srl X2, c1), c2)) where c2 is a constant with
+    // leading zeros, followed by consecutive set bits, followed by 1 or 2
+    // trailing zeros, e.g. 1020.
+    // Transform the expression to
+    // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+    // of trailing zeros of c2. The left shift would be folded as a shifter
+    // operand of 'add' and the 'and' and 'srl' would become a bit extraction
+    // node (UBFX).
+
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    unsigned And_imm = 0;
+    if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+      if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+        std::swap(N0, N1);
+    }
+    if (!And_imm)
+      continue;
+
+    // Check if the AND mask is an immediate of the form: 000.....1111111100
+    unsigned TZ = CountTrailingZeros_32(And_imm);
+    if (TZ != 1 && TZ != 2)
+      // Be conservative here. Shifter operands aren't always free; e.g. on
+      // Swift, a left shift by 1 or 2 as a shifter operand is free, but other
+      // amounts are not. For example:
+      //   ubfx   r3, r1, #16, #8
+      //   ldr.w  r3, [r0, r3, lsl #2]
+      // vs.
+      //   mov.w  r9, #1020
+      //   and.w  r2, r9, r1, lsr #14
+      //   ldr    r2, [r0, r2]
+      continue;
+    And_imm >>= TZ;
+    if (And_imm & (And_imm + 1))
+      continue;
+
+    // Look for (and (srl X, c1), c2).
+    SDValue Srl = N1.getOperand(0);
+    unsigned Srl_imm = 0;
+    if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+        (Srl_imm <= 2))
+      continue;
+
+    // Make sure the first operand is not a shifter operand, which would
+    // prevent folding of the left shift.
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (isThumb2) {
+      if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1))
+        continue;
+    } else {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+          SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+        continue;
+    }
+
+    // Now make the transformation.
+    Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32,
+                          Srl.getOperand(0),
+                          CurDAG->getConstant(Srl_imm + TZ, MVT::i32));
+    N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32,
+                         Srl, CurDAG->getConstant(And_imm, MVT::i32));
+    N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32,
+                         N1, CurDAG->getConstant(TZ, MVT::i32));
+    CurDAG->UpdateNodeOperands(N, N0, N1);
+  }
+}
+
 /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
 /// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
 /// least on current ARM implementations) which should be avoided.
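
For readers skimming the mask check above, here is a hypothetical standalone sketch (not from the patch; the function name and driver are invented) of the same "trailing zeros + contiguous ones" classification:

// Hypothetical sketch (not from the patch): classify an AND immediate the way
// PreprocessISelDAG() does - 1 or 2 trailing zeros followed by a contiguous
// run of ones, e.g. 1020 = 0b1111111100.
#include <cstdint>
#include <cstdio>

static bool isShiftedContiguousMask(uint32_t Mask, unsigned &TZ,
                                    uint32_t &ShiftedMask) {
  if (Mask == 0)
    return false;
  TZ = __builtin_ctz(Mask);                      // stand-in for CountTrailingZeros_32
  if (TZ != 1 && TZ != 2)
    return false;                                // only lsl #1 / #2 assumed free as shifter operands
  ShiftedMask = Mask >> TZ;
  return (ShiftedMask & (ShiftedMask + 1)) == 0; // contiguous ones starting at bit 0
}

int main() {
  unsigned TZ; uint32_t M;
  std::printf("1020 -> %d\n", isShiftedContiguousMask(1020, TZ, M)); // 1 (TZ=2, M=0xff)
  std::printf("1016 -> %d\n", isShiftedContiguousMask(1016, TZ, M)); // 0 (3 trailing zeros)
  return 0;
}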
@@ -2119,10 +2202,10 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
   if (!Subtarget->hasV6T2Ops())
     return NULL;
 
-  unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
+  unsigned Opc = isSigned
+    ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
     : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
 
-
   // For unsigned extracts, check for a shift right and mask
   unsigned And_imm = 0;
   if (N->getOpcode() == ISD::AND) {
@@ -2140,7 +2223,29 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
       // Note: The width operand is encoded as width-1.
       unsigned Width = CountTrailingOnes_32(And_imm) - 1;
       unsigned LSB = Srl_imm;
+
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+      if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
+        // It's cheaper to use a right shift to extract the top bits.
+        if (Subtarget->isThumb()) {
+          Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
+          SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                            CurDAG->getTargetConstant(LSB, MVT::i32),
+                            getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+        }
+
+        // ARM models shift instructions as MOVsi with shifter operand.
+        ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL);
+        SDValue ShOpc =
+          CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB),
+                                    MVT::i32);
+        SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
+                          getAL(CurDAG), Reg0, Reg0 };
+        return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops, 5);
+      }
+
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                         CurDAG->getTargetConstant(LSB, MVT::i32),
                         CurDAG->getTargetConstant(Width, MVT::i32),
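
Side note on the new early exit above (illustration only, not part of the patch): when the extracted field reaches the most significant bit, a plain right shift already clears everything below it, so LSR/ASR is as good as UBFX/SBFX:

// Illustration (not from the patch): extracting bits [31:24] of a 32-bit value.
// Width is encoded as width-1, so LSB + Width + 1 == 32 means the field
// reaches the top bit and a single right shift suffices.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xdeadbeefu;
  unsigned LSB = 24, Width = 7;                                // encoded width-1; field is 8 bits
  assert(LSB + Width + 1 == 32);
  uint32_t viaUbfx  = (x >> LSB) & ((1u << (Width + 1)) - 1);  // ubfx-style extract
  uint32_t viaShift = x >> LSB;                                // plain lsr #24
  assert(viaUbfx == viaShift);
  return 0;
}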

test/CodeGen/ARM/bfx.ll

Lines changed: 25 additions & 0 deletions
@@ -26,3 +26,28 @@ define i32 @ubfx2(i32 %a) {
   ret i32 %t2
 }
 
+; rdar://12870177
+define i32 @ubfx_opt(i32* nocapture %ctx, i32 %x) nounwind readonly ssp {
+entry:
+; CHECK: ubfx_opt
+; CHECK: lsr [[REG1:(lr|r[0-9]+)]], r1, #24
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG1]], lsl #2]
+; CHECK: ubfx [[REG2:(lr|r[0-9]+)]], r1, #16, #8
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG2]], lsl #2]
+; CHECK: ubfx [[REG3:(lr|r[0-9]+)]], r1, #8, #8
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG3]], lsl #2]
+  %and = lshr i32 %x, 8
+  %shr = and i32 %and, 255
+  %and1 = lshr i32 %x, 16
+  %shr2 = and i32 %and1, 255
+  %shr4 = lshr i32 %x, 24
+  %arrayidx = getelementptr inbounds i32* %ctx, i32 %shr4
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds i32* %ctx, i32 %shr2
+  %1 = load i32* %arrayidx5, align 4
+  %add = add i32 %1, %0
+  %arrayidx6 = getelementptr inbounds i32* %ctx, i32 %shr
+  %2 = load i32* %arrayidx6, align 4
+  %add7 = add i32 %add, %2
+  ret i32 %add7
+}
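
For context, the IR above corresponds roughly to a table-lookup pattern like the following (an illustrative reconstruction, not taken from the test suite or the benchmarks; names are invented):

// Illustrative C++ source (invented, not from the test) that lowers to the IR
// in @ubfx_opt: byte fields of x index into a word-sized lookup table, the
// classic blowfish/rijndael shape this patch targets.
#include <cstdint>

uint32_t ubfx_opt(const uint32_t *ctx, uint32_t x) {
  return ctx[x >> 24]             // top byte: plain lsr, no ubfx needed
       + ctx[(x >> 16) & 0xff]    // middle bytes: ubfx + 'lsl #2' folded into the address
       + ctx[(x >> 8) & 0xff];
}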
