Port yield normalization from CoreCLR to Native AOT (#103675)

eduardo-vp · web-flow · commit d35f3021b91d · 2024-07-17T16:19:42.000-07:00
diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h
@@ -6,12 +6,6 @@
 #ifndef __GCENV_OS_H__
 #define __GCENV_OS_H__
 
-#ifdef HAS_SYSTEM_YIELDPROCESSOR
-// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
-#undef YieldProcessor
-#define YieldProcessor System_YieldProcessor
-#endif
-
 #define NUMA_NODE_UNDEFINED UINT16_MAX
 
 bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,14 +3,11 @@
 
 #pragma once
 
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
-#define HAS_SYSTEM_YIELDPROCESSOR
+#ifdef FEATURE_NATIVEAOT
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
+#else
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
-#ifdef YieldProcessor
-#undef YieldProcessor
 #endif
-#define YieldProcessor Dont_Use_YieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
@@ -144,17 +141,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 {
     _ASSERTE(count != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
             count = MaxCount;
         }
     }
 
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
     _ASSERTE(n != 0);
     do
     {
@@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 {
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -200,7 +197,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
     {
         n = 1;
@@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
 
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n =
-        (SIZE_T)preSkylakeCount *
+    size_t n =
+        (size_t)preSkylakeCount *
         YieldProcessorNormalization::s_yieldsPerNormalizedYield /
         PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
@@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     unsigned int spinIteration)
 {
     // This shift value should be adjusted based on the asserted conditions below
-    const UINT8 MaxShift = 3;
-    static_assert_no_msg(
-        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-    static_assert_no_msg(
-        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    const uint8_t MaxShift = 3;
+    static_assert(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+    static_assert(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,7 +20,6 @@ enum CrstType
     CrstRestrictedCallouts,
     CrstGcStressControl,
     CrstThreadStore,
-    CrstYieldProcessorNormalized,
     CrstEventPipe,
     CrstEventPipeConfig,
     CrstGcEvent,
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,9 +48,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
-    // We have some time until the first finalization request - use the time to calibrate normalized waits.
-    EnsureYieldProcessorNormalizedInitialized();
-
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
@@ -184,6 +181,11 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
     FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
     g_FinalizerDoneEvent.Set();
+
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
 }
 
 //
diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
 ThreadRunning
 WaitHandleWaitStart
 WaitHandleWaitStop
+YieldProcessorMeasurement
diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,8 +133,6 @@ static bool InitDLL(HANDLE hPalInstance)
 #endif
 #endif // !USE_PORTABLE_HELPERS
 
-    InitializeYieldProcessorNormalizedCrst();
-
 #ifdef STRESS_LOG
     uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
     uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
     return _InterlockedCompareExchange64(pDst, iValue, iComparand);
 }
 
+#ifdef HOST_X86
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    int64_t iOldValue;
+    do {
+        iOldValue = *pDst;
+    } while (PalInterlockedCompareExchange64(pDst,
+                                          iValue,
+                                          iOldValue) != iOldValue);
+    return iOldValue;
+}
+#else // HOST_X86
+EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
+#pragma intrinsic(_InterlockedExchange64)
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    return _InterlockedExchange64(pDst, iValue);
+}
+#endif // HOST_X86
+
 #if defined(HOST_AMD64) || defined(HOST_ARM64)
 EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
 #pragma intrinsic(_InterlockedCompareExchange128)
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,104 +15,6 @@
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
 
-#define ULONGLONG int64_t
+#include "../../utilcode/yieldprocessornormalized.cpp"
 
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
-
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
-void InitializeYieldProcessorNormalizedCrst()
-{
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
-}
-
-static void InitializeYieldProcessorNormalized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
-
-    if (s_isYieldProcessorNormalizedInitialized)
-    {
-        return;
-    }
-
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
-
-    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
-
-    if (ticksPerSecond < 1000 / MeasureDurationMs)
-    {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
-    }
-
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
-      ULONGLONG startTicks = PalQueryPerformanceCounter();
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
-        {
-            System_YieldProcessor();
-        }
-        yieldCount += 1000;
-
-        ULONGLONG nowTicks = PalQueryPerformanceCounter();
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
-    {
-        nsPerYield = 1;
-    }
-
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
-    {
-        yieldsPerNormalizedYield = 1;
-    }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
-    {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
-    }
-
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-}
-
-void EnsureYieldProcessorNormalizedInitialized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    if (!s_isYieldProcessorNormalizedInitialized)
-    {
-        InitializeYieldProcessorNormalized();
-    }
-}
+#include "../../vm/yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp