Decrease writes to local variables in Buffer.MemoryCopy #6627
Changes from 15 commits
@@ -15,6 +15,12 @@ namespace System {
     using System.Security;
     using System.Runtime;
 
+#if BIT64
+    using nuint = System.UInt64;
+#else // BIT64
+    using nuint = System.UInt32;
+#endif // BIT64
+
     [System.Runtime.InteropServices.ComVisible(true)]
     public static class Buffer
     {
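For context: C# had no built-in native-sized unsigned integer at the time (the `nuint` keyword only arrived with C# 9), so the PR simulates one with a per-file `using` alias. A minimal self-contained sketch of the same technique; the `BIT64` define and class name are illustrative, not from the PR:

```csharp
// Sketch: compile with csc /define:BIT64 on a 64-bit target.
#if BIT64
using nuint = System.UInt64;   // pointer-sized unsigned integer on 64-bit
#else
using nuint = System.UInt32;   // pointer-sized unsigned integer on 32-bit
#endif

static class NuintDemo
{
    static void Main()
    {
        nuint len = 1000;
        // The arithmetic below compiles to 64-bit or 32-bit operations
        // depending on the define, so call sites no longer need their
        // own #if BIT64 / #else duplication.
        System.Console.WriteLine(len / 16);
    }
}
```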
@@ -264,25 +270,23 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
         // This method has different signature for x64 and other platforms and is done for performance reasons.
         [System.Security.SecurityCritical]
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-#if BIT64
-        internal unsafe static void Memmove(byte* dest, byte* src, ulong len)
-#else
-        internal unsafe static void Memmove(byte* dest, byte* src, uint len)
-#endif
+        internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
         {
             // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
             // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
-#if BIT64
-            if ((ulong)dest - (ulong)src < len) goto PInvoke;
-#else
-            if (((uint)dest - (uint)src) < len) goto PInvoke;
-#endif
+
+            if ((nuint)dest - (nuint)src < len) goto PInvoke;
+
             // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
             //
             // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
             // possible yet and so we have this implementation here for now.
-            //
+
+            // Note: It's important that this switch handles lengths at least up to 22.
+            // See notes below near the main loop for why.
+
             // The switch will be very fast since it can be implemented using a jump
             // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
+
             switch (len)
             {
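Why the single unsigned compare works: `(nuint)dest - (nuint)src < len` is true exactly when `dest` lies inside `[src, src + len)`, the one case where a forward copy would overwrite source bytes before they are read. When `dest` is below `src`, the unsigned subtraction wraps around to a huge value and the test fails, so the forward path is taken even for overlapping buffers. A small worked example; the addresses are made up for illustration:

```csharp
using System;

class OverlapCheckDemo
{
    static void Main()
    {
        ulong src = 100, len = 8;

        // dest inside [src, src + len): 104 - 100 = 4 < 8, so a forward copy
        // would clobber src bytes 4..7 before reading them -> PInvoke path.
        ulong dest = 104;
        Console.WriteLine(dest - src < len);  // True

        // dest below src: 96 - 100 wraps to 2^64 - 4, which is not < 8,
        // so the forward copy is safe even though the ranges overlap.
        dest = 96;
        Console.WriteLine(dest - src < len);  // False
    }
}
```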
@@ -292,14 +296,14 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
                     *dest = *src;
                     return;
                 case 2:
-                    *(short *)dest = *(short *)src;
+                    *(short*)dest = *(short*)src;
                     return;
                 case 3:
-                    *(short *)dest = *(short *)src;
+                    *(short*)dest = *(short*)src;
                     *(dest + 2) = *(src + 2);
                     return;
                 case 4:
-                    *(int *)dest = *(int *)src;
+                    *(int*)dest = *(int*)src;
                     return;
                 case 5:
                     *(int*)dest = *(int*)src;
@@ -401,87 +405,173 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
                     *(int*)(dest + 12) = *(int*)(src + 12);
 #endif
                     return;
-                default:
-                    break;
+                case 17:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(dest + 16) = *(src + 16);
+                    return;
+                case 18:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(short*)(dest + 16) = *(short*)(src + 16);
+                    return;
+                case 19:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(short*)(dest + 16) = *(short*)(src + 16);
+                    *(dest + 18) = *(src + 18);
+                    return;
+                case 20:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(int*)(dest + 16) = *(int*)(src + 16);
+                    return;
+                case 21:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(int*)(dest + 16) = *(int*)(src + 16);
+                    *(dest + 20) = *(src + 20);
+                    return;
+                case 22:
+#if BIT64
+                    *(long*)dest = *(long*)src;
+                    *(long*)(dest + 8) = *(long*)(src + 8);
+#else
+                    *(int*)dest = *(int*)src;
+                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(dest + 12) = *(int*)(src + 12);
+#endif
+                    *(int*)(dest + 16) = *(int*)(src + 16);
+                    *(short*)(dest + 20) = *(short*)(src + 20);
+                    return;
             }
 
             // P/Invoke into the native version for large lengths
             if (len >= 512) goto PInvoke;
 
+            nuint i = 0; // byte offset at which we're copying
+
             if (((int)dest & 3) != 0)
             {
                 if (((int)dest & 1) != 0)
                 {
-                    *dest = *src;
-                    src++;
-                    dest++;
-                    len--;
-                    if (((int)dest & 2) == 0)
-                        goto Aligned;
+                    *(dest + i) = *(src + i);
+                    i += 1;
+                    if (((int)dest & 2) != 0)
+                        goto IntAligned;
                 }
-                *(short *)dest = *(short *)src;
-                src += 2;
-                dest += 2;
-                len -= 2;
-            Aligned: ;
+                *(short*)(dest + i) = *(short*)(src + i);
+                i += 2;
             }
+
+            IntAligned:
+
 #if BIT64
-            if (((int)dest & 4) != 0)
+            if ((((int)dest - 1) & 4) == 0)
             {
-                *(int *)dest = *(int *)src;
-                src += 4;
-                dest += 4;
-                len -= 4;
+                *(int*)(dest + i) = *(int*)(src + i);
+                i += 4;
             }
 #endif

Review comment (on the `(((int)dest - 1) & 4) == 0` condition): It would be nice to add a comment explaining this condition.
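On the reviewer's point, the condition is indeed subtle. After the byte and short fix-ups above, `dest + i` is 4-byte aligned, and the question is whether one more 4-byte copy is needed to reach the 8-byte alignment the `long` stores want. That is the case exactly when `dest % 8` is 1, 2, 3, or 4, which is what `(((int)dest - 1) & 4) == 0` tests; presumably the form was chosen so it can be computed from the original `dest` alone, without first forming `dest + i`. A self-contained sketch (names are illustrative) that enumerates all eight residues to check the claim:

```csharp
using System;

class AlignCheckDemo
{
    static void Main()
    {
        for (int d = 0; d < 8; d++) // d plays the role of dest % 8
        {
            int i = 0;
            bool skipShort = false;
            if ((d & 3) != 0)
            {
                if ((d & 1) != 0)
                {
                    i += 1;                   // the byte fix-up
                    skipShort = (d & 2) != 0; // the 'goto IntAligned' case
                }
                if (!skipShort)
                    i += 2;                   // the short fix-up
            }
            bool fires  = ((d - 1) & 4) == 0;
            bool needed = (d + i) % 8 == 4;   // one int copy away from 8-aligned
            Console.WriteLine($"dest%8={d}: fires={fires}, needed={needed}"); // always equal
        }
    }
}
```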
 
-#if BIT64
-            ulong count = len / 16;
-#else
-            uint count = len / 16;
-#endif
-            while (count > 0)
+            nuint end = len - 16;
+
+            // We know due to the above switch-case that
+            // this loop will always run 1 iteration; max
+            // bytes we copy before checking is 23 (7 to
+            // align the pointers, 16 for 1 iteration) so
+            // the switch handles lengths 0-22.
+            Contract.Assert(end >= 7 && i <= end);
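A note on the assert's arithmetic: the alignment fix-ups above advance `i` by at most 1 + 2 + 4 = 7 bytes, and the do-while below copies 16 bytes before its condition is first tested, so up to 23 bytes are consumed before any length check. Every length of 22 or less therefore has to be fully handled (and returned from) by the switch, which is what the new cases 17-22 guarantee; `end >= 7` then holds because `len >= 23` implies `end = len - 16 >= 7`.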
+            do
             {
+                // This loop looks very costly since there
+                // appear to be a bunch of temporary values
+                // being created with the adds, but the jit
+                // (for x86 anyways) will convert each of
+                // these to use memory addressing operands.
+                // So the only cost is a bit of code size,
+                // which is made up for by the fact that
+                // we save on writes to dest/src.
+

Review comment (on the comment block above): The lines seem to be a little short; you should break only after at least 80 chars.
 #if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
-                ((long*)dest)[1] = ((long*)src)[1];
+                *(long*)(dest + i) = *(long*)(src + i);
+                *(long*)(dest + i + 8) = *(long*)(src + i + 8);
 #else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
-                ((int*)dest)[2] = ((int*)src)[2];
-                ((int*)dest)[3] = ((int*)src)[3];
+                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
+                *(int*)(dest + i + 8) = *(int*)(src + i + 8);
+                *(int*)(dest + i + 12) = *(int*)(src + i + 12);
 #endif
-                dest += 16;
-                src += 16;
-                count--;
+
+                i += 16;
Review comment (on `i += 16`): Split the r+w dependency of `i`:

    int counter;
    do {
        counter = i + 16;
        *(long*)(dest + i) = *(long*)(src + i);
        // ...
        i = counter;
    } while (counter <= end);

Author reply: Thanks for suggesting this -- it seems to be speeding the inner loop up by about 5%, from what I can tell after repeated runs of the tests on x64.
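A plausible reading of why the suggestion helps (my interpretation, not stated in the thread): with `i += 16` at the end of the body, the loop's compare-and-branch depends on a write that comes after all the stores; computing `counter = i + 16` at the top lets the comparison be resolved early and overlap with the stores, shortening the loop-carried dependency chain.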
             }
+            while (i <= end);
+
+            len -= i; // len now represents how many bytes are left after the unrolled loop

Review comment (on `len -= i`): Before the loop? Is it dependent on the result in the loop? If not, its result can be prepared well ahead of the next uses on the lines below.
             if ((len & 8) != 0)
             {
 #if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
+                *(long*)(dest + i) = *(long*)(src + i);
 #else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
+                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
 #endif
-                dest += 8;
-                src += 8;
+                i += 8;
             }
             if ((len & 4) != 0)
             {
-                ((int*)dest)[0] = ((int*)src)[0];
-                dest += 4;
-                src += 4;
+                *(int*)(dest + i) = *(int*)(src + i);
+                i += 4;
             }
             if ((len & 2) != 0)
             {
-                ((short*)dest)[0] = ((short*)src)[0];
-                dest += 2;
-                src += 2;
+                *(short*)(dest + i) = *(short*)(src + i);
+                i += 2;
             }
             if ((len & 1) != 0)
-                *dest = *src;
+            {
+                *(dest + i) = *(src + i);
+                // We're not using i after this, so not needed
+                // i += 1;
+            }
 
             return;
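How the tail copies cover every remainder: after `len -= i`, the leftover count is between 0 and 15, so its 8-, 4-, 2-, and 1-bits describe it completely, and each `if` copies exactly the chunk its bit represents. A worked example with 13 leftover bytes (the value is illustrative):

```csharp
using System;

class TailCopyDemo
{
    static void Main()
    {
        uint len = 13; // leftover bytes after the unrolled loop: 0b1101
        Console.WriteLine((len & 8) != 0); // True:  one 8-byte chunk  (bytes 0-7)
        Console.WriteLine((len & 4) != 0); // True:  one 4-byte chunk  (bytes 8-11)
        Console.WriteLine((len & 2) != 0); // False: no 2-byte chunk
        Console.WriteLine((len & 1) != 0); // True:  the final byte    (byte 12)
        // 8 + 4 + 1 = 13: every leftover byte is copied exactly once, no loop.
    }
}
```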
@@ -495,11 +585,7 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
         [SecurityCritical]
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
         [MethodImplAttribute(MethodImplOptions.NoInlining)]
-#if BIT64
-        private unsafe static void _Memmove(byte* dest, byte* src, ulong len)
-#else
-        private unsafe static void _Memmove(byte* dest, byte* src, uint len)
-#endif
+        private unsafe static void _Memmove(byte* dest, byte* src, nuint len)
         {
             __Memmove(dest, src, len);
         }
@@ -508,12 +594,7 @@ private unsafe static void _Memmove(byte* dest, byte* src, uint len)
         [SuppressUnmanagedCodeSecurity]
         [SecurityCritical]
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-#if BIT64
-        extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len);
-#else
-        extern private unsafe static void __Memmove(byte* dest, byte* src, uint len);
-#endif
+        extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len);
 
         // The attributes on this method are chosen for best JIT performance.
         // Please do not edit unless intentional.
@@ -526,11 +607,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
             {
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
             }
-#if BIT64
-            Memmove((byte*)destination, (byte*)source, checked((ulong)sourceBytesToCopy));
-#else
-            Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
-#endif // BIT64
+            Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
         }
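For reference, a typical caller of the public API whose internals change here: `Buffer.MemoryCopy` validates `sourceBytesToCopy` against the destination size and then forwards to `Memmove`. The buffer contents and sizes below are illustrative:

```csharp
using System;

unsafe class MemoryCopyExample // compile with /unsafe
{
    static void Main()
    {
        byte[] srcArray = { 1, 2, 3, 4, 5 };
        byte[] dstArray = new byte[8];

        fixed (byte* src = srcArray)
        fixed (byte* dst = dstArray)
        {
            // destinationSizeInBytes guards against overruns: an
            // ArgumentOutOfRangeException is thrown if sourceBytesToCopy
            // exceeds it.
            Buffer.MemoryCopy(src, dst, dstArray.Length, srcArray.Length);
        }

        Console.WriteLine(string.Join(",", dstArray)); // 1,2,3,4,5,0,0,0
    }
}
```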
@@ -547,7 +624,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
             }
 #if BIT64
             Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
-#else
+#else // BIT64
             Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
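Note the asymmetry with the `long` overload above: on 64-bit a `ulong` length passes straight through to `Memmove` since it already matches `nuint`, while on 32-bit it must be range-checked down to `uint`, so a length above `UInt32.MaxValue` throws an `OverflowException` from the `checked` cast rather than silently truncating.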
Review comment (on the `i += 1` line in the alignment block): Nit: `i` is always zero here.

Author reply: @bbowyersmyth Yep, I included that just in case someone changes the code in the future. From what I can see it doesn't make a difference, since the JIT is able to do constant propagation here and `i += 1` is just compiled to `mov i, 1`.