Skip to content

Commit bc1a872

Browse files
author
Aman Khalid
authored
Enable fake hot/cold splitting on ARM64 (#70708)
This commit contains fixes for various bugs exposed by enabling fake hot/cold splitting on ARM64:

- Branches between hot/cold sections are now always long.
- The pseudoinstruction for loading a constant from the cold section did not support loading 16-byte data into vector registers, as it temporarily loaded the constant into an 8-byte integer register. Now, 16-byte constants are loaded directly into vector registers via an `ld1` instruction.
- Asserts/NYIs blocking hot/cold splitting on ARM64 have been removed.

Fake hot/cold splitting requires we fake unwind info by treating each split function as one hot section. A more architecture-agnostic approach for this has been applied. To facilitate this approach, the fake-splitting implementation has been revised to place the hot and cold sections contiguously in memory (immediately followed by the read-only data section on ARM64).
1 parent f310367 commit bc1a872

File tree

10 files changed

+330
-242
lines changed

10 files changed

+330
-242
lines changed

src/coreclr/jit/compiler.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3199,10 +3199,10 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
31993199

32003200
opts.compProcedureSplitting = jitFlags->IsSet(JitFlags::JIT_FLAG_PROCSPLIT) || enableFakeSplitting;
32013201

3202-
#ifdef TARGET_ARM64
3203-
// TODO-ARM64-NYI: enable hot/cold splitting
3202+
#ifdef TARGET_LOONGARCH64
3203+
// Hot/cold splitting is not being tested on LoongArch64.
32043204
opts.compProcedureSplitting = false;
3205-
#endif // TARGET_ARM64
3205+
#endif // TARGET_LOONGARCH64
32063206

32073207
#ifdef DEBUG
32083208
opts.compProcedureSplittingEH = opts.compProcedureSplitting;

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7660,7 +7660,7 @@ class Compiler
76607660

76617661
// ICorJitInfo wrappers
76627662

7663-
void eeAllocMem(AllocMemArgs* args);
7663+
void eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment);
76647664

76657665
void eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize);
76667666

@@ -8017,10 +8017,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
80178017
void unwindReserveFuncHelper(FuncInfoDsc* func, bool isHotCode);
80188018
void unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pColdCode, bool isHotCode);
80198019

8020-
#ifdef DEBUG
8021-
void fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode);
8022-
#endif // DEBUG
8023-
80248020
#endif // TARGET_AMD64 || (TARGET_X86 && FEATURE_EH_FUNCLETS)
80258021

80268022
UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func);

src/coreclr/jit/ee_il_dll.cpp

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,34 +1122,64 @@ void Compiler::eeDispLineInfos()
11221122
* (e.g., host AMD64, target ARM64), then VM will get confused anyway.
11231123
*/
11241124

1125-
void Compiler::eeAllocMem(AllocMemArgs* args)
1125+
void Compiler::eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment)
11261126
{
11271127
#ifdef DEBUG
1128-
const UNATIVE_OFFSET hotSizeRequest = args->hotCodeSize;
1129-
const UNATIVE_OFFSET coldSizeRequest = args->coldCodeSize;
11301128

1131-
// Fake splitting implementation: place hot/cold code in contiguous section
1132-
if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0))
1129+
// Fake splitting implementation: place hot/cold code in contiguous section.
1130+
UNATIVE_OFFSET coldCodeOffset = 0;
1131+
if (JitConfig.JitFakeProcedureSplitting() && (args->coldCodeSize > 0))
11331132
{
1134-
args->hotCodeSize = hotSizeRequest + coldSizeRequest;
1133+
coldCodeOffset = args->hotCodeSize;
1134+
assert(coldCodeOffset > 0);
1135+
args->hotCodeSize += args->coldCodeSize;
11351136
args->coldCodeSize = 0;
11361137
}
1137-
#endif
1138+
1139+
#endif // DEBUG
1140+
1141+
#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
1142+
1143+
// For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does.
1144+
// This way allows us to use a single `ldr` to access such data like float constant/jmp table.
1145+
// For LoongArch64 using `pcaddi + ld` to access such data.
1146+
1147+
UNATIVE_OFFSET roDataAlignmentDelta = 0;
1148+
if (args->roDataSize > 0)
1149+
{
1150+
roDataAlignmentDelta = AlignmentPad(args->hotCodeSize, roDataSectionAlignment);
1151+
}
1152+
1153+
const UNATIVE_OFFSET roDataOffset = args->hotCodeSize + roDataAlignmentDelta;
1154+
args->hotCodeSize = roDataOffset + args->roDataSize;
1155+
args->roDataSize = 0;
1156+
1157+
#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
11381158

11391159
info.compCompHnd->allocMem(args);
11401160

11411161
#ifdef DEBUG
1142-
if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0))
1143-
{
1144-
// Fix up hot/cold code pointers
1145-
args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + hotSizeRequest;
1146-
args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + hotSizeRequest;
11471162

1148-
// Reset args' hot/cold code sizes in case caller reads them later
1149-
args->hotCodeSize = hotSizeRequest;
1150-
args->coldCodeSize = coldSizeRequest;
1163+
if (JitConfig.JitFakeProcedureSplitting() && (coldCodeOffset > 0))
1164+
{
1165+
// Fix up cold code pointers. Cold section is adjacent to hot section.
1166+
assert(args->coldCodeBlock == nullptr);
1167+
assert(args->coldCodeBlockRW == nullptr);
1168+
args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + coldCodeOffset;
1169+
args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + coldCodeOffset;
11511170
}
1152-
#endif
1171+
1172+
#endif // DEBUG
1173+
1174+
#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
1175+
1176+
// Fix up data section pointers.
1177+
assert(args->roDataBlock == nullptr);
1178+
assert(args->roDataBlockRW == nullptr);
1179+
args->roDataBlock = ((BYTE*)args->hotCodeBlock) + roDataOffset;
1180+
args->roDataBlockRW = ((BYTE*)args->hotCodeBlockRW) + roDataOffset;
1181+
1182+
#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
11531183
}
11541184

11551185
void Compiler::eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize)

src/coreclr/jit/emit.cpp

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4561,7 +4561,6 @@ void emitter::emitJumpDistBind()
45614561
else if (emitIsUncondJump(jmp))
45624562
{
45634563
// Nothing to do; we don't shrink these.
4564-
assert(jmp->idjShort);
45654564
ssz = JMP_SIZE_SMALL;
45664565
}
45674566
else if (emitIsLoadLabel(jmp))
@@ -6350,47 +6349,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
63506349
AllocMemArgs args;
63516350
memset(&args, 0, sizeof(args));
63526351

6353-
#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
6354-
// For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does.
6355-
// This way allows us to use a single `ldr` to access such data like float constant/jmp table.
6356-
// For LoongArch64 using `pcaddi + ld` to access such data.
6357-
if (emitTotalColdCodeSize > 0)
6358-
{
6359-
// JIT data might be far away from the cold code.
6360-
NYI("Need to handle fix-up to data from cold code.");
6361-
}
6362-
6363-
UNATIVE_OFFSET roDataAlignmentDelta = 0;
6364-
if (emitConsDsc.dsdOffs > 0)
6365-
{
6366-
roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, dataAlignment);
6367-
}
6368-
6369-
args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs;
6370-
args.coldCodeSize = emitTotalColdCodeSize;
6371-
args.roDataSize = 0;
6372-
args.xcptnsCount = xcptnsCount;
6373-
args.flag = allocMemFlag;
6374-
6375-
emitComp->eeAllocMem(&args);
6376-
6377-
codeBlock = (BYTE*)args.hotCodeBlock;
6378-
codeBlockRW = (BYTE*)args.hotCodeBlockRW;
6379-
coldCodeBlock = (BYTE*)args.coldCodeBlock;
6380-
coldCodeBlockRW = (BYTE*)args.coldCodeBlockRW;
6381-
6382-
consBlock = codeBlock + emitTotalHotCodeSize + roDataAlignmentDelta;
6383-
consBlockRW = codeBlockRW + emitTotalHotCodeSize + roDataAlignmentDelta;
6384-
6385-
#else
6386-
63876352
args.hotCodeSize = emitTotalHotCodeSize;
63886353
args.coldCodeSize = emitTotalColdCodeSize;
63896354
args.roDataSize = emitConsDsc.dsdOffs;
63906355
args.xcptnsCount = xcptnsCount;
63916356
args.flag = allocMemFlag;
63926357

6393-
emitComp->eeAllocMem(&args);
6358+
emitComp->eeAllocMem(&args, emitConsDsc.alignment);
63946359

63956360
codeBlock = (BYTE*)args.hotCodeBlock;
63966361
codeBlockRW = (BYTE*)args.hotCodeBlockRW;
@@ -6399,8 +6364,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
63996364
consBlock = (BYTE*)args.roDataBlock;
64006365
consBlockRW = (BYTE*)args.roDataBlockRW;
64016366

6402-
#endif
6403-
64046367
#ifdef DEBUG
64056368
if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0)
64066369
{

src/coreclr/jit/emit.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -997,7 +997,7 @@ class emitter
997997
case IF_LARGELDC:
998998
if (isVectorRegister(idReg1()))
999999
{
1000-
// adrp + ldr + fmov
1000+
// (adrp + ldr + fmov) or (adrp + add + ld1)
10011001
size = 12;
10021002
}
10031003
else

0 commit comments

Comments
 (0)