Skip to content

Commit 54c5b47

Browse files
Switch to iSimdVector and Align WidenAsciiToUtf16 (#99982)
* Add AnyMatches() to iSimdVector interface * Switch to iSimdVector and Align WidenAsciiToUtf16. * Fixing perf * Addressing Review Comments. * Mirroring API change : #98055 (comment)
1 parent 700d724 commit 54c5b47

File tree

6 files changed

+138
-68
lines changed

6 files changed

+138
-68
lines changed

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,19 @@ static virtual bool TryCopyTo(TSelf vector, Span<T> destination)
553553
// New Surface Area
554554
//
555555

556+
/// <summary>Determines whether any lane of the vector is equal to the specified value.</summary>
557+
/// <param name="vector">The vector whose lanes are checked.</param>
558+
/// <param name="value">The value to compare each lane against.</param>
559+
/// <returns><c>true</c> if <paramref name="vector" /> has any lane equal to <paramref name="value" />; otherwise, <c>false</c>.</returns>
560+
/// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
561+
static abstract bool Any(TSelf vector, T value);
562+
563+
/// <summary>Determines whether any lane of the vector has all of its bits set.</summary>
564+
/// <param name="vector">The vector whose lanes are checked.</param>
565+
/// <returns><c>true</c> if <paramref name="vector" /> has any lane with all bits set; otherwise, <c>false</c>.</returns>
566+
/// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
567+
static abstract bool AnyWhereAllBitsSet(TSelf vector);
568+
556569
static abstract int IndexOfLastMatch(TSelf vector);
557570
}
558571
}

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
692692
// New Surface Area
693693
//
694694

695+
/// <inheritdoc cref="ISimdVector{TSelf, T}.AnyWhereAllBitsSet(TSelf)" />
static bool ISimdVector<Vector128<T>, T>.AnyWhereAllBitsSet(Vector128<T> vector)
696+
{
697+
// Lane-wise comparison against Vector128<T>.AllBitsSet; true as soon as one lane compares equal.
// NOTE(review): for floating-point T an all-bits-set lane is a NaN bit pattern. If EqualsAny uses
// IEEE equality, such a lane would never compare equal - confirm the intended behavior for float/double.
return (Vector128.EqualsAny(vector, Vector128<T>.AllBitsSet));
698+
}
699+
700+
/// <inheritdoc cref="ISimdVector{TSelf, T}.Any(TSelf, T)" />
static bool ISimdVector<Vector128<T>, T>.Any(Vector128<T> vector, T value)
701+
{
702+
// Broadcast value into every lane, then test whether any lane of vector equals it.
// The (T) cast is a no-op: value is already declared as T.
return (Vector128.EqualsAny(vector, Vector128.Create((T)value)));
703+
}
704+
695705
static int ISimdVector<Vector128<T>, T>.IndexOfLastMatch(Vector128<T> vector)
696706
{
697707
uint mask = vector.ExtractMostSignificantBits();

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
682682
// New Surface Area
683683
//
684684

685+
/// <inheritdoc cref="ISimdVector{TSelf, T}.AnyWhereAllBitsSet(TSelf)" />
static bool ISimdVector<Vector256<T>, T>.AnyWhereAllBitsSet(Vector256<T> vector)
686+
{
687+
// Lane-wise comparison against Vector256<T>.AllBitsSet; true as soon as one lane compares equal.
// NOTE(review): for floating-point T an all-bits-set lane is a NaN bit pattern. If EqualsAny uses
// IEEE equality, such a lane would never compare equal - confirm the intended behavior for float/double.
return (Vector256.EqualsAny(vector, Vector256<T>.AllBitsSet));
688+
}
689+
690+
/// <inheritdoc cref="ISimdVector{TSelf, T}.Any(TSelf, T)" />
static bool ISimdVector<Vector256<T>, T>.Any(Vector256<T> vector, T value)
691+
{
692+
// Broadcast value into every lane, then test whether any lane of vector equals it.
// The (T) cast is a no-op: value is already declared as T.
return (Vector256.EqualsAny(vector, Vector256.Create((T)value)));
693+
}
694+
685695
static int ISimdVector<Vector256<T>, T>.IndexOfLastMatch(Vector256<T> vector)
686696
{
687697
uint mask = vector.ExtractMostSignificantBits();

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
682682
// New Surface Area
683683
//
684684

685+
/// <inheritdoc cref="ISimdVector{TSelf, T}.AnyWhereAllBitsSet(TSelf)" />
static bool ISimdVector<Vector512<T>, T>.AnyWhereAllBitsSet(Vector512<T> vector)
686+
{
687+
// Lane-wise comparison against Vector512<T>.AllBitsSet; true as soon as one lane compares equal.
// NOTE(review): for floating-point T an all-bits-set lane is a NaN bit pattern. If EqualsAny uses
// IEEE equality, such a lane would never compare equal - confirm the intended behavior for float/double.
return (Vector512.EqualsAny(vector, Vector512<T>.AllBitsSet));
688+
}
689+
690+
/// <inheritdoc cref="ISimdVector{TSelf, T}.Any(TSelf, T)" />
static bool ISimdVector<Vector512<T>, T>.Any(Vector512<T> vector, T value)
691+
{
692+
// Broadcast value into every lane, then test whether any lane of vector equals it.
// The (T) cast is a no-op: value is already declared as T.
return (Vector512.EqualsAny(vector, Vector512.Create((T)value)));
693+
}
694+
685695
static int ISimdVector<Vector512<T>, T>.IndexOfLastMatch(Vector512<T> vector)
686696
{
687697
ulong mask = vector.ExtractMostSignificantBits();

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
757757
// New Surface Area
758758
//
759759

760+
/// <inheritdoc cref="ISimdVector{TSelf, T}.AnyWhereAllBitsSet(TSelf)" />
static bool ISimdVector<Vector64<T>, T>.AnyWhereAllBitsSet(Vector64<T> vector)
761+
{
762+
// Lane-wise comparison against Vector64<T>.AllBitsSet; true as soon as one lane compares equal.
// NOTE(review): for floating-point T an all-bits-set lane is a NaN bit pattern. If EqualsAny uses
// IEEE equality, such a lane would never compare equal - confirm the intended behavior for float/double.
return (Vector64.EqualsAny(vector, Vector64<T>.AllBitsSet));
763+
}
764+
765+
/// <inheritdoc cref="ISimdVector{TSelf, T}.Any(TSelf, T)" />
static bool ISimdVector<Vector64<T>, T>.Any(Vector64<T> vector, T value)
766+
{
767+
// Broadcast value into every lane, then test whether any lane of vector equals it.
// The (T) cast is a no-op: value is already declared as T.
return (Vector64.EqualsAny(vector, Vector64.Create((T)value)));
768+
}
769+
760770
static int ISimdVector<Vector64<T>, T>.IndexOfLastMatch(Vector64<T> vector)
761771
{
762772
uint mask = vector.ExtractMostSignificantBits();

src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs

Lines changed: 85 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
20382038

20392039
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
20402040
{
2041-
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
2042-
2043-
if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512<byte>.Count)
2041+
if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512<byte>.Count)
20442042
{
2045-
// Calculating the destination address outside the loop results in significant
2046-
// perf wins vs. relying on the JIT to fold memory addressing logic into the
2047-
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
2048-
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512<byte>.Count;
2049-
2050-
do
2051-
{
2052-
Vector512<byte> asciiVector = Vector512.Load(pAsciiBuffer + currentOffset);
2053-
2054-
if (asciiVector.ExtractMostSignificantBits() != 0)
2055-
{
2056-
break;
2057-
}
2058-
2059-
(Vector512<ushort> utf16LowVector, Vector512<ushort> utf16HighVector) = Vector512.Widen(asciiVector);
2060-
utf16LowVector.Store(pCurrentWriteAddress);
2061-
utf16HighVector.Store(pCurrentWriteAddress + Vector512<ushort>.Count);
2062-
2063-
currentOffset += (nuint)Vector512<byte>.Count;
2064-
pCurrentWriteAddress += (nuint)Vector512<byte>.Count;
2065-
} while (currentOffset <= finalOffsetWhereCanRunLoop);
2043+
WidenAsciiToUtf1_Vector<Vector512<byte>, Vector512<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
20662044
}
2067-
else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
2045+
else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256<byte>.Count)
20682046
{
2069-
// Calculating the destination address outside the loop results in significant
2070-
// perf wins vs. relying on the JIT to fold memory addressing logic into the
2071-
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
2072-
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count;
2073-
2074-
do
2075-
{
2076-
Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset);
2077-
2078-
if (asciiVector.ExtractMostSignificantBits() != 0)
2079-
{
2080-
break;
2081-
}
2082-
2083-
(Vector256<ushort> utf16LowVector, Vector256<ushort> utf16HighVector) = Vector256.Widen(asciiVector);
2084-
utf16LowVector.Store(pCurrentWriteAddress);
2085-
utf16HighVector.Store(pCurrentWriteAddress + Vector256<ushort>.Count);
2086-
2087-
currentOffset += (nuint)Vector256<byte>.Count;
2088-
pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
2089-
} while (currentOffset <= finalOffsetWhereCanRunLoop);
2047+
WidenAsciiToUtf1_Vector<Vector256<byte>, Vector256<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
20902048
}
2091-
else
2049+
else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128<byte>.Count)
20922050
{
2093-
// Calculating the destination address outside the loop results in significant
2094-
// perf wins vs. relying on the JIT to fold memory addressing logic into the
2095-
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
2096-
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count;
2097-
2098-
do
2099-
{
2100-
Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset);
2101-
2102-
if (VectorContainsNonAsciiChar(asciiVector))
2103-
{
2104-
break;
2105-
}
2106-
2107-
(Vector128<ushort> utf16LowVector, Vector128<ushort> utf16HighVector) = Vector128.Widen(asciiVector);
2108-
utf16LowVector.Store(pCurrentWriteAddress);
2109-
utf16HighVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count);
2110-
2111-
currentOffset += (nuint)Vector128<byte>.Count;
2112-
pCurrentWriteAddress += (nuint)Vector128<byte>.Count;
2113-
} while (currentOffset <= finalOffsetWhereCanRunLoop);
2051+
WidenAsciiToUtf1_Vector<Vector128<byte>, Vector128<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
21142052
}
21152053
}
21162054

@@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
22122150
goto Finish;
22132151
}
22142152

2153+
/// <summary>
/// Width-generic vector loop used by WidenAsciiToUtf16: loads <typeparamref name="TVectorByte"/>-sized groups of
/// bytes from <paramref name="pAsciiBuffer"/>, widens each byte to a 16-bit element and stores the result to
/// <paramref name="pUtf16Buffer"/>. Processing stops at the first vector that contains a byte with its high bit
/// set (see HasMatch) or when fewer than one full vector of input remains.
/// </summary>
/// <param name="pAsciiBuffer">Source bytes.</param>
/// <param name="pUtf16Buffer">Destination buffer, written as 16-bit elements.</param>
/// <param name="currentOffset">Element offset into the source; advanced by this method as data is consumed.</param>
/// <param name="elementCount">Total number of source elements.</param>
// NOTE(review): the method name reads like a truncation of "Utf16"; renaming would also require
// updating every call site, so it is left as-is here.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
2154+
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
2155+
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
2156+
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
2157+
{
2158+
// NOTE(review): the write cursor starts at pUtf16Buffer itself, not pUtf16Buffer + currentOffset, and
// currentOffset is later advanced by a distance measured from pUtf16Buffer. This looks correct only
// when currentOffset is 0 on entry - confirm against the caller.
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
2159+
// Calculating the destination address outside the loop results in significant
2160+
// perf wins vs. relying on the JIT to fold memory addressing logic into the
2161+
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
2162+
// Last source offset at which a full vector can still be loaded. The subtraction is unsigned, so this
// presumably relies on the caller guaranteeing elementCount >= TVectorByte.Count - verify at call sites.
nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;
2163+
// The first vector is handled outside the loop so the write cursor can be realigned afterwards.
TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
2164+
if (!HasMatch<TVectorByte>(asciiVector))
2165+
{
2166+
(TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
2167+
utf16LowVector.Store(pCurrentWriteAddress);
2168+
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
2169+
// Two ushort vectors were written, i.e. one element per source byte.
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
2170+
// Only realign when the destination address is a multiple of sizeof(char).
if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0)
2171+
{
2172+
// Round the write cursor DOWN to a TVectorUInt16.Alignment boundary (the mask clears the low
// address bits; assumes Alignment is a power of two - TODO confirm). The elements between that
// boundary and the end of the first store are simply converted and written again by the loop.
2173+
pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1));
2174+
// Distance in bytes from the start of the destination; halved to get a count of 16-bit elements.
nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer;
2175+
currentOffset += (nuint)numBytesWritten / 2;
2176+
}
2177+
else
2178+
{
2179+
// If the destination isn't char aligned, we won't be able to align it to a vector boundary,
// so just advance by the one full vector that was consumed.
2180+
currentOffset += (nuint)TVectorByte.Count;
2181+
}
2182+
// Main loop: one full source vector per iteration while a full vector of input remains.
while (currentOffset <= finalOffsetWhereCanRunLoop)
2183+
{
2184+
asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
2185+
if (HasMatch<TVectorByte>(asciiVector))
2186+
{
2187+
// A byte with the high bit set was found; leave currentOffset at the start of this vector.
break;
2188+
}
2189+
(utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
2190+
utf16LowVector.Store(pCurrentWriteAddress);
2191+
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
2192+
2193+
currentOffset += (nuint)TVectorByte.Count;
2194+
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
2195+
}
2196+
}
2197+
return;
2198+
}
2199+
2200+
/// <summary>
/// Returns <c>true</c> when at least one byte lane of <paramref name="vector"/> has its most significant
/// bit (0x80) set, i.e. holds a value outside the 7-bit ASCII range.
/// </summary>
/// <param name="vector">The byte vector to inspect.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
2201+
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
2202+
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
2203+
{
2204+
// Mask every lane down to its top bit; the masked vector differs from Zero only if some lane had that bit set.
return !(vector & TVectorByte.Create((byte)0x80)).Equals(TVectorByte.Zero);
2205+
}
2206+
2207+
2208+
/// <summary>
/// Widens a byte vector into a pair of ushort vectors by dispatching on the concrete type of
/// <typeparamref name="TVectorByte"/> to Vector128.Widen, Vector256.Widen or Vector512.Widen.
/// </summary>
/// <param name="vector">The byte vector to widen.</param>
/// <returns>The two widened vectors, named Lower and Upper, as produced by the concrete Widen call.</returns>
// NOTE(review): TVectorUInt16 is presumably expected to be the ushort vector of the same width as
// TVectorByte; the casts through object below would not succeed for a mismatched pair - confirm callers.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
2209+
private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector)
2210+
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
2211+
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
2212+
{
2213+
// The generic arguments are converted to and from the concrete vector types by casting through object.
if (typeof(TVectorByte) == typeof(Vector256<byte>))
2214+
{
2215+
(Vector256<ushort> Lower256, Vector256<ushort> Upper256) = Vector256.Widen((Vector256<byte>)(object)vector);
2216+
return ((TVectorUInt16)(object)Lower256, (TVectorUInt16)(object)Upper256);
2217+
}
2218+
else if (typeof(TVectorByte) == typeof(Vector512<byte>))
2219+
{
2220+
(Vector512<ushort> Lower512, Vector512<ushort> Upper512) = Vector512.Widen((Vector512<byte>)(object)vector);
2221+
return ((TVectorUInt16)(object)Lower512, (TVectorUInt16)(object)Upper512);
2222+
}
2223+
else
2224+
{
2225+
// Only Vector128<byte> is expected to reach this branch; the assert checks that in debug builds.
Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>));
2226+
(Vector128<ushort> Lower128, Vector128<ushort> Upper128) = Vector128.Widen((Vector128<byte>)(object)vector);
2227+
return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128);
2228+
}
2229+
}
2230+
2231+
22152232
/// <summary>
22162233
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
22172234
/// writes them to the output buffer with machine endianness.

0 commit comments

Comments
 (0)