Skip to content

Commit b88ff31

Browse files
committed
Replace successive "ldr" and "str" instructions with "ldp" and "stp"
This change addresses the following four GitHub tickets: (1) ARM64: Optimize pair of "ldr reg, [fp]" to ldp (dotnet#35130); (2) ARM64: Optimize pair of "ldr reg, [reg]" to ldp (dotnet#35132); (3) ARM64: Optimize pair of "str reg, [reg]" to stp (dotnet#35133); (4) ARM64: Optimize pair of "str reg, [fp]" to stp (dotnet#35134). The technique detects an optimisation opportunity while instruction sequences are being generated: the optimised instruction is generated on top of the previous instruction, and no second instruction is generated. As a result, there are no changes to instruction group size at "emission time" and no changes to jump instructions.
1 parent a17d3d2 commit b88ff31

File tree

2 files changed

+219
-9
lines changed

2 files changed

+219
-9
lines changed

src/coreclr/jit/emitarm64.cpp

Lines changed: 215 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5676,6 +5676,22 @@ void emitter::emitIns_R_R_I(
56765676
{
56775677
return;
56785678
}
5679+
5680+
if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
5681+
{
5682+
regNumber oldReg1 = emitLastIns->idReg1();
5683+
ssize_t oldImm =
5684+
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
5685+
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
5686+
ssize_t scaledOldImm = oldImm * size;
5687+
5688+
// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
5689+
// into the output buffer.
5690+
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);
5691+
5692+
// And now stop here, as the second instruction descriptor is no longer emitted.
5693+
return;
5694+
}
56795695
}
56805696
else if (isAddSub)
56815697
{
@@ -6491,7 +6507,8 @@ void emitter::emitIns_R_R_R_I(instruction ins,
64916507
regNumber reg3,
64926508
ssize_t imm,
64936509
insOpts opt /* = INS_OPTS_NONE */,
6494-
emitAttr attrReg2 /* = EA_UNKNOWN */)
6510+
emitAttr attrReg2 /* = EA_UNKNOWN */,
6511+
instrDesc* reuseInstr /* = nullptr */)
64956512
{
64966513
emitAttr size = EA_SIZE(attr);
64976514
emitAttr elemsize = EA_UNKNOWN;
@@ -6626,6 +6643,7 @@ void emitter::emitIns_R_R_R_I(instruction ins,
66266643
scale = (size == EA_8BYTE) ? 3 : 2;
66276644
}
66286645
isLdSt = true;
6646+
fmt = IF_LS_3C;
66296647
break;
66306648

66316649
case INS_ld1:
@@ -6906,7 +6924,58 @@ void emitter::emitIns_R_R_R_I(instruction ins,
69066924
}
69076925
assert(fmt != IF_NONE);
69086926

6909-
instrDesc* id = emitNewInstrCns(attr, imm);
6927+
// An "instrDesc" will *always* be required.
6928+
// Under normal circumstances the instruction
6929+
// will be added to the emitted group. However,
6930+
// this is not correct for instructions that
6931+
// are going to overwrite already-emitted
6932+
// instructions and we therefore need space to
6933+
// hold the new instruction descriptor.
6934+
instrDesc* id;
6935+
6936+
// One cannot simply instantiate an instruction
6937+
// descriptor, so this array will be used to
6938+
// hold the instruction being built.
6939+
unsigned char tempInstrDesc[sizeof(instrDesc)];
6940+
6941+
// Now the instruction is either emitted OR
6942+
// used to overwrite the previously-emitted
6943+
// instruction.
6944+
if (reuseInstr == nullptr)
6945+
{
6946+
id = emitNewInstrCns(attr, imm);
6947+
}
6948+
else
6949+
{
6950+
id = (instrDesc*)tempInstrDesc;
6951+
6952+
memset(id, 0, sizeof(tempInstrDesc));
6953+
6954+
// Store the size and handle the two special
6955+
// values that indicate GCref and ByRef
6956+
6957+
if (EA_IS_GCREF(attr))
6958+
{
6959+
// A special value indicates a GCref pointer value
6960+
6961+
id->idGCref(GCT_GCREF);
6962+
id->idOpSize(EA_PTRSIZE);
6963+
}
6964+
else if (EA_IS_BYREF(attr))
6965+
{
6966+
// A special value indicates a Byref pointer value
6967+
6968+
id->idGCref(GCT_BYREF);
6969+
id->idOpSize(EA_PTRSIZE);
6970+
}
6971+
else
6972+
{
6973+
id->idGCref(GCT_NONE);
6974+
id->idOpSize(EA_SIZE(attr));
6975+
}
6976+
6977+
id->idSmallCns(imm);
6978+
}
69106979

69116980
id->idIns(ins);
69126981
id->idInsFmt(fmt);
@@ -6932,8 +7001,18 @@ void emitter::emitIns_R_R_R_I(instruction ins,
69327001
}
69337002
}
69347003

6935-
dispIns(id);
6936-
appendToCurIG(id);
7004+
// Now the instruction is EITHER emitted OR used to overwrite the previously-emitted instruction.
7005+
if (reuseInstr == nullptr)
7006+
{
7007+
// Then this is the standard exit path and the instruction is to be appended to the instruction group.
7008+
dispIns(id);
7009+
appendToCurIG(id);
7010+
}
7011+
else
7012+
{
7013+
// The instruction is copied over the last emitted instruction.
7014+
memcpy(reuseInstr, id, sizeof(tempInstrDesc));
7015+
}
69377016
}
69387017

69397018
/*****************************************************************************
@@ -7623,8 +7702,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
76237702
{
76247703
bool useRegForImm = false;
76257704
ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
7626-
7627-
imm = disp;
7705+
imm = disp;
76287706
if (imm == 0)
76297707
{
76307708
fmt = IF_LS_2A;
@@ -7670,6 +7748,25 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
76707748

76717749
assert(fmt != IF_NONE);
76727750

7751+
// Try to fold this "ldr"/"str" and the previously emitted one into a single "ldp"/"stp".
7752+
if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
7753+
{
7754+
regNumber oldReg1 = emitLastIns->idReg1();
7755+
ssize_t oldImm =
7756+
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
7757+
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
7758+
ssize_t scaledOldImm = oldImm * size;
7759+
7760+
// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
7761+
// into the output buffer.
7762+
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);
7763+
7764+
// And now stop here, as the second instruction descriptor is no longer emitted.
7765+
return;
7766+
}
7767+
7768+
// We need to simply emit the instruction unchanged
7769+
76737770
instrDesc* id = emitNewInstrCns(attr, imm);
76747771

76757772
id->idIns(ins);
@@ -7901,6 +7998,22 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
79017998

79027999
assert(fmt != IF_NONE);
79038000

8001+
if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
8002+
{
8003+
regNumber oldReg1 = emitLastIns->idReg1();
8004+
ssize_t oldImm =
8005+
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
8006+
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
8007+
ssize_t scaledOldImm = oldImm * size;
8008+
8009+
// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
8010+
// into the output buffer.
8011+
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);
8012+
8013+
// And now stop here, as the second instruction descriptor is no longer emitted.
8014+
return;
8015+
}
8016+
79048017
instrDesc* id = emitNewInstrCns(attr, imm);
79058018

79068019
id->idIns(ins);
@@ -16128,4 +16241,100 @@ bool emitter::IsRedundantLdStr(
1612816241

1612916242
return false;
1613016243
}
16244+
16245+
//-----------------------------------------------------------------------------------
16246+
// IsOptimisableLdrStr: Check if it is possible to optimise two "ldr" or "str"
16247+
// instructions into a single "ldp" or "stp" instruction.
16248+
//
16249+
// Arguments:
16250+
// ins - The instruction code
16251+
// reg1 - Register 1 number
16252+
// reg2 - Register 2 number
16253+
// imm - Immediate offset, prior to scaling by operand size
16254+
// size - Operand size
16255+
// fmt - Instruction format
16256+
//
16257+
16258+
bool emitter::IsOptimisableLdrStr(
16259+
instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt)
16260+
{
16261+
bool isFirstInstrInBlock = (emitCurIGinsCnt == 0) && ((emitCurIG->igFlags & IGF_EXTEND) == 0);
16262+
16263+
if (((ins != INS_ldr) && (ins != INS_str)) || (isFirstInstrInBlock) || (emitLastIns == nullptr))
16264+
{
16265+
return false;
16266+
}
16267+
16268+
regNumber prevReg1 = emitLastIns->idReg1();
16269+
regNumber prevReg2 = emitLastIns->idReg2();
16270+
insFormat lastInsFmt = emitLastIns->idInsFmt();
16271+
emitAttr prevSize = emitLastIns->idOpSize();
16272+
ssize_t prevImm = emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
16273+
16274+
// Signed, *raw* immediate value fits in 7 bits, so
16275+
// for LDP/ STP the raw value is from -64 to +63.
16276+
// For LDR/ STR, there are 9 bits, so we need to
16277+
// limit the range explicitly in software.
16278+
if ((imm < -64) || (imm > 63) || (prevImm < -64) || (prevImm > 63))
16279+
{
16280+
// Then one or more of the immediate values is
16281+
// out of range, so we cannot optimise.
16282+
return false;
16283+
}
16284+
16285+
if ((!isGeneralRegisterOrZR(reg1)) || (!isGeneralRegisterOrZR(prevReg1)))
16286+
{
16287+
// Either register 1 is not a general register
16288+
// or previous register 1 is not a general register
16289+
// or the zero register, so we cannot optimise.
16290+
return false;
16291+
}
16292+
16293+
if (!((ins == emitLastIns->idIns()) && (ins == INS_ldr || ins == INS_str)))
16294+
{
16295+
// Not successive ldr or str instructions
16296+
return false;
16297+
}
16298+
16299+
if (lastInsFmt != fmt)
16300+
{
16301+
// The formats of the two instructions differ.
16302+
return false;
16303+
}
16304+
16305+
if ((emitInsIsLoad(ins)) && (reg1 == prevReg1))
16306+
{
16307+
// Cannot load to the same register twice.
16308+
return false;
16309+
}
16310+
16311+
if (prevSize != size)
16312+
{
16313+
// Operand sizes differ.
16314+
return false;
16315+
}
16316+
16317+
if (imm != (prevImm + 1))
16318+
{
16319+
// Not consecutive immediate values.
16320+
return false;
16321+
}
16322+
16323+
if (emitSizeOfInsDsc(emitLastIns) != sizeof(instrDesc))
16324+
{
16325+
// Not instruction descriptors of the
16326+
// same, standard size.
16327+
return false;
16328+
}
16329+
16330+
if (!((reg2 == prevReg2) && isGeneralRegisterOrSP(reg2)))
16331+
{
16332+
// The "register 2" numbers need to be
16333+
// the same AND general registers or
16334+
// the stack pointer.
16335+
return false;
16336+
}
16337+
return true;
16338+
}
16339+
1613116340
#endif // defined(TARGET_ARM64)

src/coreclr/jit/emitarm64.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static UINT64 Replicate_helper(UINT64 value, unsigned width, emitAttr size);
112112
static bool IsMovInstruction(instruction ins);
113113
bool IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regNumber src, bool canSkip);
114114
bool IsRedundantLdStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
115-
115+
bool IsOptimisableLdrStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
116116
/************************************************************************
117117
*
118118
* This union is used to to encode/decode the special ARM64 immediate values
@@ -775,8 +775,9 @@ void emitIns_R_R_R_I(instruction ins,
775775
regNumber reg2,
776776
regNumber reg3,
777777
ssize_t imm,
778-
insOpts opt = INS_OPTS_NONE,
779-
emitAttr attrReg2 = EA_UNKNOWN);
778+
insOpts opt = INS_OPTS_NONE,
779+
emitAttr attrReg2 = EA_UNKNOWN,
780+
instrDesc* reuseInstr = nullptr);
780781

781782
void emitIns_R_R_R_Ext(instruction ins,
782783
emitAttr attr,

0 commit comments

Comments
 (0)