Skip to content

Commit d35f302

Browse files
authored
Port yield normalization from CoreCLR to Native AOT (#103675)
1 parent 8c95a64 commit d35f302

12 files changed

+390
-653
lines changed

src/coreclr/gc/env/gcenv.os.h

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,12 +6,6 @@
66
#ifndef __GCENV_OS_H__
77
#define __GCENV_OS_H__
88

9-
#ifdef HAS_SYSTEM_YIELDPROCESSOR
10-
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
11-
#undef YieldProcessor
12-
#define YieldProcessor System_YieldProcessor
13-
#endif
14-
159
#define NUMA_NODE_UNDEFINED UINT16_MAX
1610

1711
bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);

src/coreclr/inc/yieldprocessornormalized.h

Lines changed: 18 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -3,14 +3,11 @@
33

44
#pragma once
55

6-
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
7-
// the intention is to use the system-default implementation of YieldProcessor().
8-
#define HAS_SYSTEM_YIELDPROCESSOR
6+
#ifdef FEATURE_NATIVEAOT
7+
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
8+
#else
99
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
10-
#ifdef YieldProcessor
11-
#undef YieldProcessor
1210
#endif
13-
#define YieldProcessor Dont_Use_YieldProcessor
1411

1512
#define DISABLE_COPY(T) \
1613
T(const T &) = delete; \
@@ -144,17 +141,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
144141
{
145142
_ASSERTE(count != 0);
146143

147-
if (sizeof(SIZE_T) <= sizeof(unsigned int))
144+
if (sizeof(size_t) <= sizeof(unsigned int))
148145
{
149-
// On platforms with a small SIZE_T, prevent overflow on the multiply below
146+
// On platforms with a small size_t, prevent overflow on the multiply below
150147
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
151148
if (count > MaxCount)
152149
{
153150
count = MaxCount;
154151
}
155152
}
156153

157-
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
154+
size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
158155
_ASSERTE(n != 0);
159156
do
160157
{
@@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
189186
{
190187
_ASSERTE(preSkylakeCount != 0);
191188

192-
if (sizeof(SIZE_T) <= sizeof(unsigned int))
189+
if (sizeof(size_t) <= sizeof(unsigned int))
193190
{
194-
// On platforms with a small SIZE_T, prevent overflow on the multiply below
191+
// On platforms with a small size_t, prevent overflow on the multiply below
195192
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
196193
if (preSkylakeCount > MaxCount)
197194
{
@@ -200,7 +197,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
200197
}
201198

202199
const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
203-
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
200+
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
204201
if (n == 0)
205202
{
206203
n = 1;
@@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
227224

228225
_ASSERTE(preSkylakeCount != 0);
229226

230-
if (sizeof(SIZE_T) <= sizeof(unsigned int))
227+
if (sizeof(size_t) <= sizeof(unsigned int))
231228
{
232-
// On platforms with a small SIZE_T, prevent overflow on the multiply below
229+
// On platforms with a small size_t, prevent overflow on the multiply below
233230
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
234231
if (preSkylakeCount > MaxCount)
235232
{
@@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
238235
}
239236

240237
const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
241-
SIZE_T n =
242-
(SIZE_T)preSkylakeCount *
238+
size_t n =
239+
(size_t)preSkylakeCount *
243240
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
244241
PreSkylakeCountToSkylakeCountDivisor;
245242
if (n == 0)
@@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
268265
unsigned int spinIteration)
269266
{
270267
// This shift value should be adjusted based on the asserted conditions below
271-
const UINT8 MaxShift = 3;
272-
static_assert_no_msg(
273-
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
274-
static_assert_no_msg(
275-
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
268+
const uint8_t MaxShift = 3;
269+
static_assert(
270+
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
271+
static_assert(
272+
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
276273

277274
unsigned int n;
278275
if (spinIteration <= MaxShift &&

src/coreclr/nativeaot/Runtime/Crst.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ enum CrstType
2020
CrstRestrictedCallouts,
2121
CrstGcStressControl,
2222
CrstThreadStore,
23-
CrstYieldProcessorNormalized,
2423
CrstEventPipe,
2524
CrstEventPipeConfig,
2625
CrstGcEvent,

src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -48,9 +48,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
4848

4949
g_pFinalizerThread = PTR_Thread(pThread);
5050

51-
// We have some time until the first finalization request - use the time to calibrate normalized waits.
52-
EnsureYieldProcessorNormalizedInitialized();
53-
5451
// Wait for a finalization request.
5552
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
5653
ASSERT(uResult == WAIT_OBJECT_0);
@@ -184,6 +181,11 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
184181
{
185182
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
186183
g_FinalizerDoneEvent.Set();
184+
185+
if (YieldProcessorNormalization::IsMeasurementScheduled())
186+
{
187+
YieldProcessorNormalization::PerformMeasurement();
188+
}
187189
}
188190

189191
//

src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
113113
ThreadRunning
114114
WaitHandleWaitStart
115115
WaitHandleWaitStop
116+
YieldProcessorMeasurement

src/coreclr/nativeaot/Runtime/startup.cpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -133,8 +133,6 @@ static bool InitDLL(HANDLE hPalInstance)
133133
#endif
134134
#endif // !USE_PORTABLE_HELPERS
135135

136-
InitializeYieldProcessorNormalizedCrst();
137-
138136
#ifdef STRESS_LOG
139137
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
140138
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();

src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
5656
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
5757
}
5858

59+
// PalInterlockedExchange64: stores iValue into *pDst and returns the value that was there before.
// Two implementations are selected by HOST_X86:
//   - HOST_X86:  a retry loop built on PalInterlockedCompareExchange64.
//   - otherwise: a direct call to the _InterlockedExchange64 compiler intrinsic.
// NOTE(review): presumably the x86 path exists because the 64-bit exchange intrinsic is not
// available when targeting 32-bit x86 - confirm against the MSVC intrinsic documentation.
#ifdef HOST_X86
60+
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
61+
{
62+
int64_t iOldValue;
63+
do {
64+
// Plain read used only as the comparand guess; it is validated by the compare-exchange below.
iOldValue = *pDst;
65+
// The swap is attempted with iOldValue as the comparand. If the call returns something other
// than iOldValue, the guess did not match the destination and the loop re-reads and retries.
} while (PalInterlockedCompareExchange64(pDst,
66+
iValue,
67+
iOldValue) != iOldValue);
68+
// The compare-exchange returned iOldValue, so this is the value that iValue replaced.
return iOldValue;
69+
}
70+
#else // HOST_X86
71+
// Declare the intrinsic and ask the compiler to treat it as one via #pragma intrinsic.
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
72+
#pragma intrinsic(_InterlockedExchange64)
73+
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
74+
{
75+
// Thin wrapper: forwards both arguments and returns the intrinsic's result unchanged.
return _InterlockedExchange64(pDst, iValue);
76+
}
77+
#endif // HOST_X86
78+
5979
#if defined(HOST_AMD64) || defined(HOST_ARM64)
6080
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
6181
#pragma intrinsic(_InterlockedCompareExchange128)

src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp

Lines changed: 2 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -15,104 +15,6 @@
1515
#include "volatile.h"
1616
#include "yieldprocessornormalized.h"
1717

18-
#define ULONGLONG int64_t
18+
#include "../../utilcode/yieldprocessornormalized.cpp"
1919

20-
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
21-
static CrstStatic s_initializeYieldProcessorNormalizedCrst;
22-
23-
// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
24-
// tuned for Skylake processors
25-
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
26-
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
27-
28-
void InitializeYieldProcessorNormalizedCrst()
29-
{
30-
WRAPPER_NO_CONTRACT;
31-
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
32-
}
33-
34-
static void InitializeYieldProcessorNormalized()
35-
{
36-
WRAPPER_NO_CONTRACT;
37-
38-
CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
39-
40-
if (s_isYieldProcessorNormalizedInitialized)
41-
{
42-
return;
43-
}
44-
45-
// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
46-
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
47-
const int MeasureDurationMs = 10;
48-
const int NsPerSecond = 1000 * 1000 * 1000;
49-
50-
ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
51-
52-
if (ticksPerSecond < 1000 / MeasureDurationMs)
53-
{
54-
// High precision clock not available or clock resolution is too low, resort to defaults
55-
s_isYieldProcessorNormalizedInitialized = true;
56-
return;
57-
}
58-
59-
// Measure the nanosecond delay per yield
60-
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
61-
unsigned int yieldCount = 0;
62-
ULONGLONG startTicks = PalQueryPerformanceCounter();
63-
ULONGLONG elapsedTicks;
64-
do
65-
{
66-
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
67-
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
68-
// low microsecond range.
69-
for (int i = 0; i < 1000; ++i)
70-
{
71-
System_YieldProcessor();
72-
}
73-
yieldCount += 1000;
74-
75-
ULONGLONG nowTicks = PalQueryPerformanceCounter();
76-
elapsedTicks = nowTicks - startTicks;
77-
} while (elapsedTicks < measureDurationTicks);
78-
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
79-
if (nsPerYield < 1)
80-
{
81-
nsPerYield = 1;
82-
}
83-
84-
// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
85-
// value is naturally limited to MinNsPerNormalizedYield.
86-
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
87-
if (yieldsPerNormalizedYield < 1)
88-
{
89-
yieldsPerNormalizedYield = 1;
90-
}
91-
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
92-
93-
// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
94-
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
95-
// better job of allowing other work to run.
96-
int optimalMaxNormalizedYieldsPerSpinIteration =
97-
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
98-
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
99-
{
100-
optimalMaxNormalizedYieldsPerSpinIteration = 1;
101-
}
102-
103-
g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
104-
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
105-
s_isYieldProcessorNormalizedInitialized = true;
106-
107-
GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
108-
}
109-
110-
void EnsureYieldProcessorNormalizedInitialized()
111-
{
112-
WRAPPER_NO_CONTRACT;
113-
114-
if (!s_isYieldProcessorNormalizedInitialized)
115-
{
116-
InitializeYieldProcessorNormalized();
117-
}
118-
}
20+
#include "../../vm/yieldprocessornormalizedshared.cpp"

0 commit comments

Comments
 (0)