Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Decrease writes to local variables in Buffer.MemoryCopy #6627

Merged
merged 18 commits into from
Aug 6, 2016
Merged
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 162 additions & 85 deletions src/mscorlib/src/System/Buffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ namespace System {
using System.Security;
using System.Runtime;

#if BIT64
using nuint = System.UInt64;
#else // BIT64
using nuint = System.UInt32;
#endif // BIT64

[System.Runtime.InteropServices.ComVisible(true)]
public static class Buffer
{
Expand Down Expand Up @@ -264,25 +270,23 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
// This method has different signature for x64 and other platforms and is done for performance reasons.
[System.Security.SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
#if BIT64
internal unsafe static void Memmove(byte* dest, byte* src, ulong len)
#else
internal unsafe static void Memmove(byte* dest, byte* src, uint len)
#endif
internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
{
// P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
// This check can produce false positives for lengths greater than Int32.MaxValue. It is fine because we want to use the PInvoke path for large lengths anyway.
#if BIT64
if ((ulong)dest - (ulong)src < len) goto PInvoke;
#else
if (((uint)dest - (uint)src) < len) goto PInvoke;
#endif
//

if ((nuint)dest - (nuint)src < len) goto PInvoke;

// This is a portable version of memcpy. It mirrors what the hand-optimized assembly versions of memcpy typically do.
//
// Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
// possible yet and so we have this implementation here for now.
//

// Note: It's important that this switch handles lengths at least up to 22.
// See notes below near the main loop for why.

// The switch will be very fast since it can be implemented using a jump
// table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.

switch (len)
{
Expand All @@ -292,14 +296,14 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
*dest = *src;
return;
case 2:
*(short *)dest = *(short *)src;
*(short*)dest = *(short*)src;
return;
case 3:
*(short *)dest = *(short *)src;
*(short*)dest = *(short*)src;
*(dest + 2) = *(src + 2);
return;
case 4:
*(int *)dest = *(int *)src;
*(int*)dest = *(int*)src;
return;
case 5:
*(int*)dest = *(int*)src;
Expand Down Expand Up @@ -401,87 +405,173 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
return;
default:
break;
case 17:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(dest + 16) = *(src + 16);
return;
case 18:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(short*)(dest + 16) = *(short*)(src + 16);
return;
case 19:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(short*)(dest + 16) = *(short*)(src + 16);
*(dest + 18) = *(src + 18);
return;
case 20:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(int*)(dest + 16) = *(int*)(src + 16);
return;
case 21:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(int*)(dest + 16) = *(int*)(src + 16);
*(dest + 20) = *(src + 20);
return;
case 22:
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
#endif
*(int*)(dest + 16) = *(int*)(src + 16);
*(short*)(dest + 20) = *(short*)(src + 20);
return;
}

// P/Invoke into the native version for large lengths
if (len >= 512) goto PInvoke;

nuint i = 0; // byte offset at which we're copying

if (((int)dest & 3) != 0)
{
if (((int)dest & 1) != 0)
{
*dest = *src;
src++;
dest++;
len--;
if (((int)dest & 2) == 0)
goto Aligned;
*(dest + i) = *(src + i);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: i is always zero here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bbowyersmyth Yep, I included that just in case someone changes the code in the future. From what I can see it doesn't make a difference since the JIT is able to do constant propagation here and i += 1 is just compiled to mov i, 1.

i += 1;
if (((int)dest & 2) != 0)
goto IntAligned;
}
*(short *)dest = *(short *)src;
src += 2;
dest += 2;
len -= 2;
Aligned: ;
*(short*)(dest + i) = *(short*)(src + i);
i += 2;
}

IntAligned:

#if BIT64
if (((int)dest & 4) != 0)
if ((((int)dest - 1) & 4) == 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to add comment to this condition

{
*(int *)dest = *(int *)src;
src += 4;
dest += 4;
len -= 4;
*(int*)(dest + i) = *(int*)(src + i);
i += 4;
}
#endif

#if BIT64
ulong count = len / 16;
#else
uint count = len / 16;
#endif
while (count > 0)
nuint end = len - 16;

// We know due to the above switch-case that
// this loop will always run 1 iteration; max
// bytes we copy before checking is 23 (7 to
// align the pointers, 16 for 1 iteration) so
// the switch handles lengths 0-22.
Contract.Assert(end >= 7 && i <= end);

do
{
// This loop looks very costly since there
// appear to be a bunch of temporary values
// being created with the adds, but the jit
// (for x86 anyways) will convert each of
// these to use memory addressing operands.
// So the only cost is a bit of code size,
// which is made up for by the fact that
// we save on writes to dest/src.
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lines seem to be a little short; you should break only after at least 80 characters.


#if BIT64
((long*)dest)[0] = ((long*)src)[0];
((long*)dest)[1] = ((long*)src)[1];
*(long*)(dest + i) = *(long*)(src + i);
*(long*)(dest + i + 8) = *(long*)(src + i + 8);
#else
((int*)dest)[0] = ((int*)src)[0];
((int*)dest)[1] = ((int*)src)[1];
((int*)dest)[2] = ((int*)src)[2];
((int*)dest)[3] = ((int*)src)[3];
*(int*)(dest + i) = *(int*)(src + i);
*(int*)(dest + i + 4) = *(int*)(src + i + 4);
*(int*)(dest + i + 8) = *(int*)(src + i + 8);
*(int*)(dest + i + 12) = *(int*)(src + i + 12);
#endif
dest += 16;
src += 16;
count--;

i += 16;
Copy link
Member

@benaadams benaadams Aug 5, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Split the r+w dependency of i+=16; and perform the addition at the start of the pipeline so its value is known when the test is reached and the cpu can predict the correct branches?

int counter;
do {
    counter = i + 16;
    *(long*)(dest + i) = *(long*)(src + i);
    // ...
    i = counter;
} while (counter <= end);

Copy link
Author

@jamesqo jamesqo Aug 5, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for suggesting this-- it seems to be speeding the inner loop up by about 5%, from what I can tell after repeated runs of the tests on x64.

}
while (i <= end);

len -= i; // len now represents how many bytes are left after the unrolled loop
Copy link
Member

@benaadams benaadams Aug 5, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

len -= end;

Before the loop? Is it dependent on the result in i? (not sure as loop is <=)

If not its result can be prepared well ahead of the next uses on the lines below.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@benaadams end is just len - 16, so that would always be 16. I suppose this could come before the loop since i is always incremented by 16 however, so the lower 4 bits would still be the same.


if ((len & 8) != 0)
{
#if BIT64
((long*)dest)[0] = ((long*)src)[0];
*(long*)(dest + i) = *(long*)(src + i);
#else
((int*)dest)[0] = ((int*)src)[0];
((int*)dest)[1] = ((int*)src)[1];
*(int*)(dest + i) = *(int*)(src + i);
*(int*)(dest + i + 4) = *(int*)(src + i + 4);
#endif
dest += 8;
src += 8;
}
if ((len & 4) != 0)
{
((int*)dest)[0] = ((int*)src)[0];
dest += 4;
src += 4;
}
if ((len & 2) != 0)
{
((short*)dest)[0] = ((short*)src)[0];
dest += 2;
src += 2;
}
if ((len & 1) != 0)
*dest = *src;
i += 8;
}
if ((len & 4) != 0)
{
*(int*)(dest + i) = *(int*)(src + i);
i += 4;
}
if ((len & 2) != 0)
{
*(short*)(dest + i) = *(short*)(src + i);
i += 2;
}
if ((len & 1) != 0)
{
*(dest + i) = *(src + i);
// We're not using i after this, so not needed
// i += 1;
}

return;

Expand All @@ -495,11 +585,7 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
[SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
[MethodImplAttribute(MethodImplOptions.NoInlining)]
#if BIT64
private unsafe static void _Memmove(byte* dest, byte* src, ulong len)
#else
private unsafe static void _Memmove(byte* dest, byte* src, uint len)
#endif
private unsafe static void _Memmove(byte* dest, byte* src, nuint len)
{
__Memmove(dest, src, len);
}
Expand All @@ -508,12 +594,7 @@ private unsafe static void _Memmove(byte* dest, byte* src, uint len)
[SuppressUnmanagedCodeSecurity]
[SecurityCritical]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
#if BIT64
extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len);
#else
extern private unsafe static void __Memmove(byte* dest, byte* src, uint len);
#endif

extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len);

// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
Expand All @@ -526,11 +607,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
{
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
}
#if BIT64
Memmove((byte*)destination, (byte*)source, checked((ulong) sourceBytesToCopy));
#else
Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
#endif // BIT64
Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
}


Expand All @@ -547,7 +624,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
}
#if BIT64
Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
#else
#else // BIT64
Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
#endif // BIT64
}
Expand Down