@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
2038
2038
2039
2039
if ( BitConverter . IsLittleEndian && Vector128 . IsHardwareAccelerated && elementCount >= ( uint ) Vector128 < byte > . Count )
2040
2040
{
2041
- ushort * pCurrentWriteAddress = ( ushort * ) pUtf16Buffer;
2042
-
2043
- if ( Vector512 . IsHardwareAccelerated && elementCount >= ( uint ) Vector512 < byte > . Count )
2041
+ if ( Vector512 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector512 < byte > . Count )
2044
2042
{
2045
- // Calculating the destination address outside the loop results in significant
2046
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2047
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2048
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector512 < byte > . Count ;
2049
-
2050
- do
2051
- {
2052
- Vector512 < byte > asciiVector = Vector512 . Load ( pAsciiBuffer + currentOffset ) ;
2053
-
2054
- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2055
- {
2056
- break ;
2057
- }
2058
-
2059
- ( Vector512 < ushort > utf16LowVector , Vector512 < ushort > utf16HighVector ) = Vector512 . Widen ( asciiVector ) ;
2060
- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2061
- utf16HighVector . Store ( pCurrentWriteAddress + Vector512 < ushort > . Count ) ;
2062
-
2063
- currentOffset += ( nuint ) Vector512 < byte > . Count ;
2064
- pCurrentWriteAddress += ( nuint ) Vector512 < byte > . Count ;
2065
- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2043
+ WidenAsciiToUtf1_Vector < Vector512 < byte > , Vector512 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2066
2044
}
2067
- else if ( Vector256 . IsHardwareAccelerated && elementCount >= ( uint ) Vector256 < byte > . Count )
2045
+ else if ( Vector256 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector256 < byte > . Count )
2068
2046
{
2069
- // Calculating the destination address outside the loop results in significant
2070
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2071
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2072
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector256 < byte > . Count ;
2073
-
2074
- do
2075
- {
2076
- Vector256 < byte > asciiVector = Vector256 . Load ( pAsciiBuffer + currentOffset ) ;
2077
-
2078
- if ( asciiVector . ExtractMostSignificantBits ( ) != 0 )
2079
- {
2080
- break ;
2081
- }
2082
-
2083
- ( Vector256 < ushort > utf16LowVector , Vector256 < ushort > utf16HighVector ) = Vector256 . Widen ( asciiVector ) ;
2084
- utf16LowVector . Store ( pCurrentWriteAddress ) ;
2085
- utf16HighVector . Store ( pCurrentWriteAddress + Vector256 < ushort > . Count ) ;
2086
-
2087
- currentOffset += ( nuint ) Vector256 < byte > . Count ;
2088
- pCurrentWriteAddress += ( nuint ) Vector256 < byte > . Count ;
2089
- } while ( currentOffset < = finalOffsetWhereCanRunLoop ) ;
2047
+ WidenAsciiToUtf1_Vector < Vector256 < byte > , Vector256 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2090
2048
}
2091
- else
2049
+ else if ( Vector128 . IsHardwareAccelerated && ( elementCount - currentOffset ) >= ( uint ) Vector128 < byte > . Count )
2092
2050
{
2093
- // Calculating the destination address outside the loop results in significant
2094
- // perf wins vs. relying on the JIT to fold memory addressing logic into the
2095
- // write instructions. See: https://github.com/dotnet/runtime/issues/33002
2096
- nuint finalOffsetWhereCanRunLoop = elementCount - ( uint ) Vector128 < byte > . Count ;
2097
-
2098
- do
2099
- {
2100
- Vector128 < byte > asciiVector = Vector128. Load( pAsciiBuffer + currentOffset) ;
2101
-
2102
- if ( VectorContainsNonAsciiChar( asciiVector) )
2103
- {
2104
- break ;
2105
- }
2106
-
2107
- ( Vector128< ushort > utf16LowVector , Vector128 < ushort > utf16HighVector ) = Vector128 . Widen ( asciiVector ) ;
2108
- utf16LowVector. Store( pCurrentWriteAddress) ;
2109
- utf16HighVector. Store( pCurrentWriteAddress + Vector128< ushort > . Count) ;
2110
-
2111
- currentOffset += ( nuint ) Vector128< byte > . Count;
2112
- pCurrentWriteAddress += ( nuint ) Vector128< byte > . Count;
2113
- } while ( currentOffset <= finalOffsetWhereCanRunLoop) ;
2051
+ WidenAsciiToUtf1_Vector < Vector128 < byte > , Vector128 < ushort > > ( pAsciiBuffer , pUtf16Buffer , ref currentOffset , elementCount ) ;
2114
2052
}
2115
2053
}
2116
2054
@@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
2212
2150
goto Finish ;
2213
2151
}
2214
2152
2153
/// <summary>
/// Widens ASCII bytes from <paramref name="pAsciiBuffer"/> into UTF-16 code units in
/// <paramref name="pUtf16Buffer"/>, one <typeparamref name="TVectorByte"/> at a time, beginning at
/// element index <paramref name="currentOffset"/>. Processing stops at the first vector that contains
/// a byte with its high bit (0x80) set, or once fewer than <c>TVectorByte.Count</c> elements remain.
/// </summary>
/// <typeparam name="TVectorByte">The byte vector type used to load the source data.</typeparam>
/// <typeparam name="TVectorUInt16">The ushort vector type of matching width used to store the widened data.</typeparam>
/// <param name="pAsciiBuffer">Start of the source byte buffer.</param>
/// <param name="pUtf16Buffer">Start of the destination char buffer.</param>
/// <param name="currentOffset">
/// On entry, the index of the first element to process. On exit, the index of the first element that
/// was not widened by this method. It is left unchanged when the very first vector contains a byte
/// with its high bit set.
/// </param>
/// <param name="elementCount">Total number of elements in the source buffer.</param>
/// <remarks>
/// The caller must guarantee that at least <c>TVectorByte.Count</c> elements remain, i.e.
/// <c>elementCount - currentOffset &gt;= TVectorByte.Count</c>; the first load is unconditional.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
    Debug.Assert(currentOffset <= elementCount && (elementCount - currentOffset) >= (nuint)TVectorByte.Count);

    // Calculating the destination address outside the loop results in significant
    // perf wins vs. relying on the JIT to fold memory addressing logic into the
    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
    //
    // The write cursor starts at the destination slot matching currentOffset so that reads
    // (pAsciiBuffer + currentOffset) and writes always refer to the same element index.
    ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer + currentOffset;
    nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;

    TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
    if (HasMatch<TVectorByte>(asciiVector))
    {
        // Non-ASCII data in the very first vector: nothing is written, currentOffset is untouched.
        return;
    }

    (TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
    utf16LowVector.Store(pCurrentWriteAddress);
    utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
    pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);

    if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0)
    {
        // Round the write cursor DOWN to the previous vector-aligned address. The elements between
        // the aligned address and the end of the first store are simply widened and written again,
        // which is harmless and lets the remaining stores target aligned addresses.
        pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1));

        // Re-derive the element index from the cursor's distance to the start of the destination
        // buffer so the read offset stays in lock-step with the write cursor.
        currentOffset = ((nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer) / sizeof(char);
    }
    else
    {
        // If the destination isn't char aligned, we won't be able to align it to a Vector
        currentOffset += (nuint)TVectorByte.Count;
    }

    while (currentOffset <= finalOffsetWhereCanRunLoop)
    {
        asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
        if (HasMatch<TVectorByte>(asciiVector))
        {
            break;
        }

        (utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
        utf16LowVector.Store(pCurrentWriteAddress);
        utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);

        currentOffset += (nuint)TVectorByte.Count;
        pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
    }
}
2199
+
2200
/// <summary>
/// Reports whether any byte lane of <paramref name="vector"/> has its most significant bit (0x80) set.
/// </summary>
/// <typeparam name="TVectorByte">The byte vector type being inspected.</typeparam>
/// <param name="vector">The vector to test.</param>
/// <returns>
/// <see langword="true"/> if masking <paramref name="vector"/> with 0x80 in every lane yields a
/// non-zero vector; otherwise <see langword="false"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
{
    // Keep only the top bit of every lane; anything left over means a lane had that bit set.
    TVectorByte highBitMask = TVectorByte.Create((byte)0x80);
    TVectorByte highBitsOnly = vector & highBitMask;
    bool noHighBitSet = highBitsOnly.Equals(TVectorByte.Zero);
    return !noHighBitSet;
}
2206
+
2207
+
2208
/// <summary>
/// Widens a byte vector into two ushort vectors by dispatching on the concrete vector type to
/// <c>Vector512.Widen</c>, <c>Vector256.Widen</c> or <c>Vector128.Widen</c>.
/// </summary>
/// <typeparam name="TVectorByte">The source byte vector type.</typeparam>
/// <typeparam name="TVectorUInt16">The ushort vector type of the same width as <typeparamref name="TVectorByte"/>.</typeparam>
/// <param name="vector">The byte vector to widen.</param>
/// <returns>The lower and upper widened halves, in that order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector)
    where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
    where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
    if (typeof(TVectorByte) == typeof(Vector512<byte>))
    {
        Vector512<byte> source512 = (Vector512<byte>)(object)vector;
        (Vector512<ushort> low512, Vector512<ushort> high512) = Vector512.Widen(source512);
        return ((TVectorUInt16)(object)low512, (TVectorUInt16)(object)high512);
    }

    if (typeof(TVectorByte) == typeof(Vector256<byte>))
    {
        Vector256<byte> source256 = (Vector256<byte>)(object)vector;
        (Vector256<ushort> low256, Vector256<ushort> high256) = Vector256.Widen(source256);
        return ((TVectorUInt16)(object)low256, (TVectorUInt16)(object)high256);
    }

    // Every remaining instantiation is treated as the 128-bit case.
    Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>));
    Vector128<byte> source128 = (Vector128<byte>)(object)vector;
    (Vector128<ushort> low128, Vector128<ushort> high128) = Vector128.Widen(source128);
    return ((TVectorUInt16)(object)low128, (TVectorUInt16)(object)high128);
}
2230
+
2231
+
2215
2232
/// <summary>
2216
2233
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
2217
2234
/// writes them to the output buffer with machine endianness.
0 commit comments