Skip to content

Adding push2/pop2 #116035

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,10 @@ class CodeGen final : public CodeGenInterface

#if defined(TARGET_XARCH)
// Pops the callee-saved registers named in rsPopRegs and returns how many were popped.
unsigned genPopCalleeSavedRegistersFromMask(regMaskTP rsPopRegs);
#ifdef TARGET_AMD64
// APX variants of the callee-save push/pop helpers: they pair registers with push2/pop2
// where possible and fall back to a single push/pop to keep the stack 16-byte aligned.
void genPushCalleeSavedRegistersFromMaskAPX(regMaskTP rsPushRegs);
unsigned genPopCalleeSavedRegistersFromMaskAPX(regMaskTP rsPopRegs);
#endif // TARGET_AMD64
#endif // defined(TARGET_XARCH)

#endif // !defined(TARGET_ARM64)
Expand Down Expand Up @@ -1587,6 +1591,10 @@ class CodeGen final : public CodeGenInterface
insFlags flags = INS_FLAGS_DONT_CARE DEBUGARG(size_t targetHandle = 0)
DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY));

#if defined(TARGET_AMD64)
void instGen_Push2Pop2Ppx(instruction ins, regNumber reg1, regNumber reg2);
#endif // defined(TARGET_AMD64)

#ifdef TARGET_XARCH
instruction genMapShiftInsToShiftByConstantIns(instruction ins, int shiftByValue);
#endif // TARGET_XARCH
Expand Down
167 changes: 166 additions & 1 deletion src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9404,6 +9404,13 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17);
theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17);

theEmitter->emitIns_R_R(INS_push2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx));
theEmitter->emitIns_R_R(INS_pop2, EA_PTRSIZE, REG_R17, REG_R18, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx));
theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx);
theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R11, INS_OPTS_APX_ppx);
theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx);
theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_R17, INS_OPTS_APX_ppx);

theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM0, false);
theEmitter->emitIns_Mov(INS_movd32, EA_4BYTE, REG_R16, REG_XMM16, false);
}
Expand Down Expand Up @@ -10257,7 +10264,6 @@ void CodeGen::genOSRSaveRemainingCalleeSavedRegisters()
osrAdditionalIntCalleeSaves &= ~regBit;
}
}

#endif // TARGET_AMD64

//------------------------------------------------------------------------
Expand Down Expand Up @@ -10307,6 +10313,14 @@ void CodeGen::genPushCalleeSavedRegisters()
}
#endif // DEBUG

#ifdef TARGET_AMD64
if (compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxPPX())
{
genPushCalleeSavedRegistersFromMaskAPX(rsPushRegs);
return;
}
#endif // TARGET_AMD64

// Push backwards so we match the order we will pop them in the epilog
// and all the other code that expects it to be in this order.
for (regNumber reg = get_REG_INT_LAST(); rsPushRegs != RBM_NONE; reg = REG_PREV(reg))
Expand All @@ -10322,6 +10336,77 @@ void CodeGen::genPushCalleeSavedRegisters()
}
}

#if defined(TARGET_AMD64)
//------------------------------------------------------------------------
// genPushCalleeSavedRegistersFromMaskAPX: push specified set of callee saves
// in the "standard" order using Push2 when possible
//
// Arguments:
// rsPushRegs - register mask of registers to push; must not include the stack pointer
//
// Return Value:
// None.
//
// Notes:
// Every emitted push/push2 is immediately followed by the matching
// compiler->unwindPush / compiler->unwindPush2 call.
// genPopCalleeSavedRegistersFromMaskAPX must restore the registers in exactly the
// reverse of the order used here, so any change to the ordering below has to be
// mirrored there.
//
void CodeGen::genPushCalleeSavedRegistersFromMaskAPX(regMaskTP rsPushRegs)
{
// This is not a funclet or an On-Stack Replacement.
assert((compiler->funCurrentFunc()->funKind == FuncKind::FUNC_ROOT) && !compiler->opts.IsOSR());
// PUSH2 doesn't work for ESP.
assert((rsPushRegs & RBM_SPBASE) == 0);
// We need to align the stack to 16 bytes to use push2/pop2.
// The ABI requirement is that the stack must be 16B aligned at the point of a function call.
// As soon as the CALL is executed, the stack is no longer 16B aligned.
// To use PP2, the stack needs to be pre-aligned
// If isFramePointerUsed() is true, we have already pushed the frame pointer and stack is aligned.
// Else, We need to issue a single push to align the stack.
if (!isFramePointerUsed() && (rsPushRegs != RBM_NONE))
{
// Prefer RBP as the alignment push when it is in the mask; otherwise take the first
// register the mask yields. Either way the register is removed from rsPushRegs so it
// is not pushed a second time below.
if ((rsPushRegs & RBM_FPBASE) != 0)
{
GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_EBP, INS_OPTS_APX_ppx);
compiler->unwindPush(REG_EBP);
rsPushRegs &= ~RBM_FPBASE;
}
else
{
regNumber alignReg = genFirstRegNumFromMaskAndToggle(rsPushRegs);
GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, alignReg, INS_OPTS_APX_ppx);
compiler->unwindPush(alignReg);
}
}

// Push backwards so we match the order we will pop them in the epilog
// and all the other code that expects it to be in this order.
// All registers to be saved are pushed to an ArrayStack, so that popping the
// ArrayStack below visits them in the reverse of the order the mask yielded them.
ArrayStack<regNumber> regStack(compiler->getAllocator(CMK_Codegen));
while (rsPushRegs != RBM_NONE)
{
regNumber reg = genFirstRegNumFromMaskAndToggle(rsPushRegs);
regStack.Push(reg);
}

// We need to push the registers in pairs.
// In cases where we have an odd number of registers, we need to push the last one
// separately at the end to maintain alignment for push2.
while (regStack.Height() > 1)
{
// reg1 is the most recently collected register, reg2 the one collected just before it.
regNumber reg1 = regStack.Pop();
regNumber reg2 = regStack.Pop();

GetEmitter()->emitIns_R_R(INS_push2, EA_PTRSIZE, reg1, reg2, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx));
compiler->unwindPush2(reg1, reg2);
}

// Odd count: the one register left over (the first one collected) gets a plain push.
if (regStack.Height() == 1)
{
regNumber reg = regStack.Pop();
GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, reg, INS_OPTS_APX_ppx);
compiler->unwindPush(reg);
}
assert(regStack.Height() == 0);
}
#endif // TARGET_AMD64

void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
{
assert(compiler->compGeneratingEpilog);
Expand Down Expand Up @@ -10355,6 +10440,14 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
return;
}

if (compiler->canUseApxEncoding() && compiler->canUseEvexEncoding() && JitConfig.EnableApxPPX())
{
regMaskTP rsPopRegs = regSet.rsGetModifiedIntCalleeSavedRegsMask();
const unsigned popCount = genPopCalleeSavedRegistersFromMaskAPX(rsPopRegs);
noway_assert(compiler->compCalleeRegsPushed == popCount);
return;
}

#endif // TARGET_AMD64

// Registers saved by a normal prolog
Expand Down Expand Up @@ -10430,6 +10523,78 @@ unsigned CodeGen::genPopCalleeSavedRegistersFromMask(regMaskTP rsPopRegs)
return popCount;
}

#if defined(TARGET_AMD64)
//------------------------------------------------------------------------
// genPopCalleeSavedRegistersFromMaskAPX: pop specified set of callee saves
// in the "standard" order using Pop2 when possible
//
// Arguments:
// rsPopRegs - register mask of registers to pop; must not include the stack pointer
//
// Return Value:
// The number of registers popped, counting each pop2 as two and including the
// alignment register when one is used.
//
// Notes:
// This is the epilog counterpart of genPushCalleeSavedRegistersFromMaskAPX and undoes
// its pushes in reverse: the odd leftover register first, then the pop2 pairs, and
// finally the alignment register that the prolog pushed first.
//
unsigned CodeGen::genPopCalleeSavedRegistersFromMaskAPX(regMaskTP rsPopRegs)
{
// This is not a funclet or an On-Stack Replacement.
assert((compiler->funCurrentFunc()->funKind == FuncKind::FUNC_ROOT) && !compiler->opts.IsOSR());
unsigned popCount = 0;
// POP2 doesn't work for ESP.
assert((rsPopRegs & RBM_SPBASE) == 0);
regNumber alignReg = REG_NA;
// We need to align the stack to 16 bytes to use push2/pop2.
// If isFramePointerUsed() is true, we will pop the frame pointer and stack will be aligned.
// Else, We need to issue a single pop after the last pop2 to align the stack.
if (!isFramePointerUsed() && (rsPopRegs != RBM_NONE))
{
// Select the same alignment register the prolog selected (RBP if present, otherwise the
// first register the mask yields) and take it out of the mask; it is popped at the very end.
if ((rsPopRegs & RBM_FPBASE) != 0)
{
alignReg = REG_EBP;
rsPopRegs &= ~RBM_FPBASE;
}
else
{
alignReg = genFirstRegNumFromMaskAndToggle(rsPopRegs);
}
}

// All registers to be restored are pushed to an ArrayStack
ArrayStack<regNumber> regStack(compiler->getAllocator(CMK_Codegen));
while (rsPopRegs != RBM_NONE)
{
regNumber reg = genFirstRegNumFromMaskAndToggle(rsPopRegs);
regStack.Push(reg);
}

// The ArrayStack is only read here, never popped: `index` walks it via Bottom(), which is
// presumably bottom-relative (Bottom(0) being the first register collected) - confirm against ArrayStack.
int index = 0;
if (regStack.Height() % 2 == 1)
{
// We have an odd number of registers to pop, so we need to pop the last one
// separately. This is the register the prolog pushed last with a single push.
regNumber reg = regStack.Bottom(index++);
GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, reg, INS_OPTS_APX_ppx);
popCount++;
}

// Pop the remaining registers in pairs. The operand order (earlier-collected register first)
// is the reverse of the operand order the prolog used for the corresponding push2.
while (index < (regStack.Height() - 1))
{
regNumber reg1 = regStack.Bottom(index++);
regNumber reg2 = regStack.Bottom(index++);
GetEmitter()->emitIns_R_R(INS_pop2, EA_PTRSIZE, reg1, reg2, (insOpts)(INS_OPTS_EVEX_nd | INS_OPTS_APX_ppx));
popCount += 2;
}
// Every collected register must have been consumed by the single pop and/or the pop2 pairs.
assert(regStack.Height() == index);

// The alignment register was pushed first in the prolog, so it is restored last.
if (alignReg != REG_NA)
{
GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, alignReg, INS_OPTS_APX_ppx);
popCount++;
}

return popCount;
}
#endif // defined(TARGET_AMD64)

/*****************************************************************************
*
* Generates code for a function epilog.
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8805,6 +8805,9 @@ class Compiler
//

void unwindPush(regNumber reg);
#if defined(TARGET_AMD64)
void unwindPush2(regNumber reg1, regNumber reg2);
#endif // TARGET_AMD64
void unwindAllocStack(unsigned size);
void unwindSetFrameReg(regNumber reg, unsigned offset);
void unwindSaveReg(regNumber reg, unsigned offset);
Expand Down Expand Up @@ -8877,6 +8880,7 @@ class Compiler

void unwindBegPrologWindows();
void unwindPushWindows(regNumber reg);
void unwindPush2Windows(regNumber reg1, regNumber reg2);
void unwindAllocStackWindows(unsigned size);
void unwindSetFrameRegWindows(regNumber reg, unsigned offset);
void unwindSaveRegWindows(regNumber reg, unsigned offset);
Expand All @@ -8895,6 +8899,7 @@ class Compiler
short mapRegNumToDwarfReg(regNumber reg);
void createCfiCode(FuncInfoDsc* func, UNATIVE_OFFSET codeOffset, UCHAR opcode, short dwarfReg, INT offset = 0);
void unwindPushPopCFI(regNumber reg);
void unwindPush2Pop2CFI(regNumber reg1, regNumber reg2);
void unwindBegPrologCFI();
void unwindPushPopMaskCFI(regMaskTP regMask, bool isFloat);
void unwindAllocStackCFI(unsigned size);
Expand Down
25 changes: 25 additions & 0 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3782,6 +3782,31 @@ const IS_INFO emitter::emitGetSchedInfo(insFormat insFmt)
assert(!"Unsupported insFmt");
return IS_NONE;
}

//------------------------------------------------------------------------
// HasApxPpx: Report whether an instruction supports the APX PPX feature.
//
// _idApxPpxContext and _idNoApxEvexXPromotion are stored in the same bit, so this
// predicate is what tells the two meanings of that bit apart for a given instruction.
//
// Arguments:
//    ins - the instruction to classify
//
// Return Value:
//    true for push, pop, push2 and pop2; false for every other instruction.
//
bool emitter::HasApxPpx(instruction ins)
{
    const bool isSinglePushPop = (ins == INS_push) || (ins == INS_pop);
    const bool isPairedPushPop = (ins == INS_push2) || (ins == INS_pop2);

    return isSinglePushPop || isPairedPushPop;
}
#endif // TARGET_XARCH

//------------------------------------------------------------------------
Expand Down
23 changes: 19 additions & 4 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -853,11 +853,14 @@ class emitter
// We repurpose 4 bits for the default flag value bits for ccmp instructions.
#define _idEvexDFV (_idCustom4 << 3) | (_idCustom3 << 2) | (_idCustom2 << 1) | _idCustom1

// In certian cases, we do not allow instructions to be promoted to APX-EVEX.
unsigned _idCustom7 : 1;
// In certain cases, we do not allow instructions to be promoted to APX-EVEX.
// e.g. instructions like add/and/or/inc/dec can be used with LOCK prefix, but cannot be prefixed by LOCK and
// EVEX together.
unsigned _idNoApxEvexXPromotion : 1;
#endif // TARGET_XARCH
#define _idNoApxEvexXPromotion _idCustom7
// We repurpose _idCustom7 for the APX-EVEX.ppx context for Push/Pop/Push2/Pop2.
#define _idApxPpxContext _idCustom7 /* bits used for the APX-EVEX.ppx context for Push/Pop/Push2/Pop2 */
#endif // TARGET_XARCH

#ifdef TARGET_ARM64
unsigned _idLclVar : 1; // access a local on stack
Expand Down Expand Up @@ -1801,9 +1804,20 @@ class emitter
_idEvexNfContext = 1;
}

// Reports whether the PPX context has been requested for this instruction.
// The backing bit is shared with the "no APX-EVEX promotion" flag, so a set bit only
// counts as PPX when the instruction is one that HasApxPpx() accepts.
bool idIsApxPpxContextSet() const
{
    if (_idApxPpxContext == 0)
    {
        return false;
    }

    return HasApxPpx(_idIns);
}

// Marks this instruction as carrying the APX-EVEX PPX context.
void idSetApxPpxContext()
{
// Setting the context twice on a PPX-capable instruction indicates a caller bug.
assert(!idIsApxPpxContextSet());
_idApxPpxContext = 1;
}

// Reports whether this instruction has been marked as "do not promote to APX-EVEX".
//
// _idNoApxEvexXPromotion and _idApxPpxContext are both aliases of _idCustom7, so the raw
// bit alone is ambiguous. For the instructions HasApxPpx() accepts (push/pop/push2/pop2)
// the bit carries the PPX context instead, and must not be reported here as "no promotion".
bool idIsNoApxEvexPromotion() const
{
    return (_idNoApxEvexXPromotion != 0) && !HasApxPpx(_idIns);
}

void idSetNoApxEvexPromotion()
Expand Down Expand Up @@ -2380,6 +2394,7 @@ class emitter
int emitGetInsCDinfo(instrDesc* id);

static const IS_INFO emitGetSchedInfo(insFormat f);
static bool HasApxPpx(instruction ins);
#endif // TARGET_XARCH

cnsval_ssize_t emitGetInsSC(const instrDesc* id) const;
Expand Down
Loading
Loading