Skip to content

Commit 840d10a

Browse files
committed
[AVR] Custom lower 32-bit shift instructions
32-bit shift instructions were previously expanded using the default SelectionDAG expander, which meant it used 16-bit constant shifts and ORed them together. This works, but is far from optimal. I've optimized 32-bit shifts on AVR using a custom inserter. This is done using three new pseudo-instructions that take the upper and lower bits of the value in two separate 16-bit registers and output two 16-bit registers. This is the first commit in a series. When completed, constant 32-bit shifts will take around 31% fewer instructions on average, and will in all cases be equal to or better than the old behavior. It also tends to match or outperform avr-gcc: the only cases where avr-gcc does better are when it uses a loop to shift, or when the LLVM register allocator inserts some unnecessary movs. But it even outperforms avr-gcc in some cases where avr-gcc does not use a loop. As a side effect, non-constant 32-bit shifts also become more efficient. For some real-world differences: the build of compiler-rt I use in TinyGo becomes 2.7% smaller and the build of picolibc I use becomes 0.9% smaller. I think picolibc is a better representation of real-world code, but even a ~1% reduction in code size is really significant. The current patch just lays the groundwork. The result is actually a regression in code size. Later patches will use this as a basis to optimize these shift instructions. Differential Revision: https://reviews.llvm.org/D140569
1 parent 2cc30c4 commit 840d10a

File tree

4 files changed

+321
-0
lines changed

4 files changed

+321
-0
lines changed

llvm/lib/Target/AVR/AVRISelLowering.cpp

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
8888
setOperationAction(ISD::SRA, MVT::i16, Custom);
8989
setOperationAction(ISD::SHL, MVT::i16, Custom);
9090
setOperationAction(ISD::SRL, MVT::i16, Custom);
91+
setOperationAction(ISD::SRA, MVT::i32, Custom);
92+
setOperationAction(ISD::SHL, MVT::i32, Custom);
93+
setOperationAction(ISD::SRL, MVT::i32, Custom);
9194
setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
9295
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
9396
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
@@ -247,10 +250,13 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
247250
NODE(CALL);
248251
NODE(WRAPPER);
249252
NODE(LSL);
253+
NODE(LSLW);
250254
NODE(LSR);
255+
NODE(LSRW);
251256
NODE(ROL);
252257
NODE(ROR);
253258
NODE(ASR);
259+
NODE(ASRW);
254260
NODE(LSLLOOP);
255261
NODE(LSRLOOP);
256262
NODE(ROLLOOP);
@@ -279,6 +285,57 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
279285
assert(isPowerOf2_32(VT.getSizeInBits()) &&
280286
"Expected power-of-2 shift amount");
281287

288+
if (VT.getSizeInBits() == 32) {
289+
if (!isa<ConstantSDNode>(N->getOperand(1))) {
290+
// 32-bit shifts are converted to a loop in IR.
291+
// This should be unreachable.
292+
report_fatal_error("Expected a constant shift amount!");
293+
}
294+
SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16);
295+
SDValue SrcLo =
296+
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
297+
DAG.getConstant(0, dl, MVT::i16));
298+
SDValue SrcHi =
299+
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
300+
DAG.getConstant(1, dl, MVT::i16));
301+
uint64_t ShiftAmount =
302+
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
303+
if (ShiftAmount == 16) {
304+
// Special case these two operations because they appear to be used by the
305+
// generic codegen parts to lower 32-bit numbers.
306+
// TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit
307+
// shift of a part of the 32-bit value?
308+
switch (Op.getOpcode()) {
309+
case ISD::SHL: {
310+
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
311+
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo);
312+
}
313+
case ISD::SRL: {
314+
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
315+
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero);
316+
}
317+
}
318+
}
319+
SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8);
320+
unsigned Opc;
321+
switch (Op.getOpcode()) {
322+
default:
323+
llvm_unreachable("Invalid 32-bit shift opcode!");
324+
case ISD::SHL:
325+
Opc = AVRISD::LSLW;
326+
break;
327+
case ISD::SRL:
328+
Opc = AVRISD::LSRW;
329+
break;
330+
case ISD::SRA:
331+
Opc = AVRISD::ASRW;
332+
break;
333+
}
334+
SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt);
335+
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0),
336+
Result.getValue(1));
337+
}
338+
282339
// Expand non-constant shifts to loops.
283340
if (!isa<ConstantSDNode>(N->getOperand(1))) {
284341
switch (Op.getOpcode()) {
@@ -1789,6 +1846,114 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
17891846
return RemBB;
17901847
}
17911848

1849+
// Do a multibyte AVR shift. Insert shift instructions and put the output
1850+
// registers in the Regs array.
1851+
// Because AVR does not have a normal shift instruction (only a single bit shift
1852+
// instruction), we have to emulate this behavior with other instructions.
1853+
static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
1854+
MutableArrayRef<std::pair<Register, int>> Regs,
1855+
ISD::NodeType Opc, int64_t ShiftAmt) {
1856+
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
1857+
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
1858+
const DebugLoc &dl = MI.getDebugLoc();
1859+
1860+
const bool ShiftLeft = Opc == ISD::SHL;
1861+
const bool ArithmeticShift = Opc == ISD::SRA;
1862+
1863+
// Shift by one. This is the fallback that always works, and the shift
1864+
// operation that is used for 1, 2, and 3 bit shifts.
1865+
while (ShiftLeft && ShiftAmt) {
1866+
// Shift one to the left.
1867+
for (ssize_t I = Regs.size() - 1; I >= 0; I--) {
1868+
Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
1869+
Register In = Regs[I].first;
1870+
Register InSubreg = Regs[I].second;
1871+
if (I == (ssize_t)Regs.size() - 1) { // first iteration
1872+
BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
1873+
.addReg(In, 0, InSubreg)
1874+
.addReg(In, 0, InSubreg);
1875+
} else {
1876+
BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
1877+
.addReg(In, 0, InSubreg)
1878+
.addReg(In, 0, InSubreg);
1879+
}
1880+
Regs[I] = std::pair(Out, 0);
1881+
}
1882+
ShiftAmt--;
1883+
}
1884+
while (!ShiftLeft && ShiftAmt) {
1885+
// Shift one to the right.
1886+
for (size_t I = 0; I < Regs.size(); I++) {
1887+
Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
1888+
Register In = Regs[I].first;
1889+
Register InSubreg = Regs[I].second;
1890+
if (I == 0) {
1891+
unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
1892+
BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
1893+
} else {
1894+
BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
1895+
}
1896+
Regs[I] = std::pair(Out, 0);
1897+
}
1898+
ShiftAmt--;
1899+
}
1900+
1901+
if (ShiftAmt != 0) {
1902+
llvm_unreachable("don't know how to shift!"); // sanity check
1903+
}
1904+
}
1905+
1906+
// Do a wide (32-bit) shift.
1907+
MachineBasicBlock *
1908+
AVRTargetLowering::insertWideShift(MachineInstr &MI,
1909+
MachineBasicBlock *BB) const {
1910+
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
1911+
const DebugLoc &dl = MI.getDebugLoc();
1912+
1913+
// How much to shift to the right (meaning: a negative number indicates a left
1914+
// shift).
1915+
int64_t ShiftAmt = MI.getOperand(4).getImm();
1916+
ISD::NodeType Opc;
1917+
switch (MI.getOpcode()) {
1918+
case AVR::Lsl32:
1919+
Opc = ISD::SHL;
1920+
break;
1921+
case AVR::Lsr32:
1922+
Opc = ISD::SRL;
1923+
break;
1924+
case AVR::Asr32:
1925+
Opc = ISD::SRA;
1926+
break;
1927+
}
1928+
1929+
// Read the input registers, with the most significant register at index 0.
1930+
std::array<std::pair<Register, int>, 4> Registers = {
1931+
std::pair(MI.getOperand(3).getReg(), AVR::sub_hi),
1932+
std::pair(MI.getOperand(3).getReg(), AVR::sub_lo),
1933+
std::pair(MI.getOperand(2).getReg(), AVR::sub_hi),
1934+
std::pair(MI.getOperand(2).getReg(), AVR::sub_lo),
1935+
};
1936+
1937+
// Do the shift. The registers are modified in-place.
1938+
insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);
1939+
1940+
// Combine the 8-bit registers into 16-bit register pairs.
1941+
BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
1942+
.addReg(Registers[0].first, 0, Registers[0].second)
1943+
.addImm(AVR::sub_hi)
1944+
.addReg(Registers[1].first, 0, Registers[1].second)
1945+
.addImm(AVR::sub_lo);
1946+
BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
1947+
.addReg(Registers[2].first, 0, Registers[2].second)
1948+
.addImm(AVR::sub_hi)
1949+
.addReg(Registers[3].first, 0, Registers[3].second)
1950+
.addImm(AVR::sub_lo);
1951+
1952+
// Remove the pseudo instruction.
1953+
MI.eraseFromParent();
1954+
return BB;
1955+
}
1956+
17921957
static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
17931958
if (I->getOpcode() == AVR::COPY) {
17941959
Register SrcReg = I->getOperand(1).getReg();
@@ -1901,6 +2066,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
19012066
case AVR::Asr8:
19022067
case AVR::Asr16:
19032068
return insertShift(MI, MBB);
2069+
case AVR::Lsl32:
2070+
case AVR::Lsr32:
2071+
case AVR::Asr32:
2072+
return insertWideShift(MI, MBB);
19042073
case AVR::MULRdRr:
19052074
case AVR::MULSRdRr:
19062075
return insertMul(MI, MBB);

llvm/lib/Target/AVR/AVRISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,17 @@ enum NodeType {
3939
LSLBN, ///< Byte logical shift left N bits.
4040
LSLWN, ///< Word logical shift left N bits.
4141
LSLHI, ///< Higher 8-bit of word logical shift left.
42+
LSLW, ///< Wide logical shift left.
4243
LSR, ///< Logical shift right.
4344
LSRBN, ///< Byte logical shift right N bits.
4445
LSRWN, ///< Word logical shift right N bits.
4546
LSRLO, ///< Lower 8-bit of word logical shift right.
47+
LSRW, ///< Wide logical shift right.
4648
ASR, ///< Arithmetic shift right.
4749
ASRBN, ///< Byte arithmetic shift right N bits.
4850
ASRWN, ///< Word arithmetic shift right N bits.
4951
ASRLO, ///< Lower 8-bit of word arithmetic shift right.
52+
ASRW, ///< Wide arithmetic shift right.
5053
ROR, ///< Bit rotate right.
5154
ROL, ///< Bit rotate left.
5255
LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -186,6 +189,8 @@ class AVRTargetLowering : public TargetLowering {
186189

187190
private:
188191
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
192+
MachineBasicBlock *insertWideShift(MachineInstr &MI,
193+
MachineBasicBlock *BB) const;
189194
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
190195
MachineBasicBlock *insertCopyZero(MachineInstr &MI,
191196
MachineBasicBlock *BB) const;

llvm/lib/Target/AVR/AVRInstrInfo.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
6969
def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
7070
def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
7171
def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
72+
// Wide (32-bit) shift nodes: logical left, logical right and arithmetic right.
// AVRTargetLowering::LowerShifts creates them with the low 16-bit half, the
// high 16-bit half and the shift amount as operands.
def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>;
73+
def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>;
74+
def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>;
7275

7376
// Pseudo shift nodes for non-constant shift amounts.
7477
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
@@ -2337,6 +2340,11 @@ def Lsl16 : ShiftPseudo<(outs DREGS
23372340
: $src, i8
23382341
: $cnt))]>;
23392342

2343+
// 32-bit logical shift left pseudo, selected from the AVRlslw node.
// The value is passed in and returned as two 16-bit register halves
// ($srclo/$srchi -> $dstlo/$dsthi); $cnt is the immediate shift amount.
// AVRTargetLowering::EmitInstrWithCustomInserter hands this opcode to
// insertWideShift.
def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
2344+
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
2345+
"# Lsl32 PSEUDO",
2346+
[(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>;
2347+
23402348
def Lsr8 : ShiftPseudo<(outs GPR8
23412349
: $dst),
23422350
(ins GPR8
@@ -2357,6 +2365,11 @@ def Lsr16 : ShiftPseudo<(outs DREGS
23572365
: $src, i8
23582366
: $cnt))]>;
23592367

2368+
// 32-bit logical shift right pseudo, selected from the AVRlsrw node.
// The value is passed in and returned as two 16-bit register halves
// ($srclo/$srchi -> $dstlo/$dsthi); $cnt is the immediate shift amount.
// AVRTargetLowering::EmitInstrWithCustomInserter hands this opcode to
// insertWideShift.
def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
2369+
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
2370+
"# Lsr32 PSEUDO",
2371+
[(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
2372+
23602373
def Rol8 : ShiftPseudo<(outs GPR8
23612374
: $dst),
23622375
(ins GPR8
@@ -2417,6 +2430,11 @@ def Asr16 : ShiftPseudo<(outs DREGS
24172430
: $src, i8
24182431
: $cnt))]>;
24192432

2433+
// 32-bit arithmetic shift right pseudo, selected from the AVRasrw node.
// The value is passed in and returned as two 16-bit register halves
// ($srclo/$srchi -> $dstlo/$dsthi); $cnt is the immediate shift amount.
// AVRTargetLowering::EmitInstrWithCustomInserter hands this opcode to
// insertWideShift.
def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
2434+
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
2435+
"# Asr32 PSEUDO",
2436+
[(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
2437+
24202438
// lowered to a copy from the zero register.
24212439
let usesCustomInserter=1 in
24222440
def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;

0 commit comments

Comments
 (0)