Skip to content

Commit fc84650

Browse files
jamesqo authored and jkotas committed
Decrease writes to local variables in Buffer.MemoryCopy (dotnet/coreclr#6627)
In `Buffer.MemoryCopy` currently we are making 4 writes every time we copy some data; 1 to update `*dest`, 1 to update `dest`, 1 to update `src` and 1 to update `len`. I've decreased it to 2; one to update a new local variable `i`, which keeps track of how many bytes we are into the buffer. All writes are now made using ```cs *(dest + i + x) = *(src + i + x) ``` which has no additional overhead since they're converted to using memory addressing operands by the jit. Another change I made was to add a few extra cases for the switch-case at the beginning that does copying for small sizes without any branches. It now covers sizes 0-22. This is beneficial to the main codepath, since we can convert the unrolled loop to a `do..while` loop and save an extra branch at the beginning. (max 7 bytes for alignment, 16 for 1 iteration of the loop, so the min bytes we can copy without checking whether we should stop is 23.) This adds This PR increases the performance of `MemoryCopy` by 10-20% for most buffer sizes on x86; you can see the performance test/results (and the generated assembly for each version) [here](https://gist.github.com/jamesqo/337852c8ce09205a8289ce1f1b9b5382). (Note that this codepath is also used by `wstrcpy` at the moment, so this directly affects many common String operations.) Commit migrated from dotnet/coreclr@32fe063
1 parent 5bc7eba commit fc84650

File tree

1 file changed

+176
-86
lines changed

1 file changed

+176
-86
lines changed

src/coreclr/src/mscorlib/src/System/Buffer.cs

Lines changed: 176 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ namespace System {
1515
using System.Security;
1616
using System.Runtime;
1717

18+
#if BIT64
19+
using nuint = System.UInt64;
20+
#else // BIT64
21+
using nuint = System.UInt32;
22+
#endif // BIT64
23+
1824
[System.Runtime.InteropServices.ComVisible(true)]
1925
public static class Buffer
2026
{
@@ -264,25 +270,23 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
264270
// This method has different signature for x64 and other platforms and is done for performance reasons.
265271
[System.Security.SecurityCritical]
266272
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
267-
#if BIT64
268-
internal unsafe static void Memmove(byte* dest, byte* src, ulong len)
269-
#else
270-
internal unsafe static void Memmove(byte* dest, byte* src, uint len)
271-
#endif
273+
internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
272274
{
273275
// P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
274276
// This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
275-
#if BIT64
276-
if ((ulong)dest - (ulong)src < len) goto PInvoke;
277-
#else
278-
if (((uint)dest - (uint)src) < len) goto PInvoke;
279-
#endif
280-
//
277+
278+
if ((nuint)dest - (nuint)src < len) goto PInvoke;
279+
281280
// This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
282281
//
283282
// Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
284283
// possible yet and so we have this implementation here for now.
285-
//
284+
285+
// Note: It's important that this switch handles lengths at least up to 22.
286+
// See notes below near the main loop for why.
287+
288+
// The switch will be very fast since it can be implemented using a jump
289+
// table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
286290

287291
switch (len)
288292
{
@@ -292,14 +296,14 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
292296
*dest = *src;
293297
return;
294298
case 2:
295-
*(short *)dest = *(short *)src;
299+
*(short*)dest = *(short*)src;
296300
return;
297301
case 3:
298-
*(short *)dest = *(short *)src;
302+
*(short*)dest = *(short*)src;
299303
*(dest + 2) = *(src + 2);
300304
return;
301305
case 4:
302-
*(int *)dest = *(int *)src;
306+
*(int*)dest = *(int*)src;
303307
return;
304308
case 5:
305309
*(int*)dest = *(int*)src;
@@ -401,87 +405,186 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
401405
*(int*)(dest + 12) = *(int*)(src + 12);
402406
#endif
403407
return;
404-
default:
405-
break;
408+
case 17:
409+
#if BIT64
410+
*(long*)dest = *(long*)src;
411+
*(long*)(dest + 8) = *(long*)(src + 8);
412+
#else
413+
*(int*)dest = *(int*)src;
414+
*(int*)(dest + 4) = *(int*)(src + 4);
415+
*(int*)(dest + 8) = *(int*)(src + 8);
416+
*(int*)(dest + 12) = *(int*)(src + 12);
417+
#endif
418+
*(dest + 16) = *(src + 16);
419+
return;
420+
case 18:
421+
#if BIT64
422+
*(long*)dest = *(long*)src;
423+
*(long*)(dest + 8) = *(long*)(src + 8);
424+
#else
425+
*(int*)dest = *(int*)src;
426+
*(int*)(dest + 4) = *(int*)(src + 4);
427+
*(int*)(dest + 8) = *(int*)(src + 8);
428+
*(int*)(dest + 12) = *(int*)(src + 12);
429+
#endif
430+
*(short*)(dest + 16) = *(short*)(src + 16);
431+
return;
432+
case 19:
433+
#if BIT64
434+
*(long*)dest = *(long*)src;
435+
*(long*)(dest + 8) = *(long*)(src + 8);
436+
#else
437+
*(int*)dest = *(int*)src;
438+
*(int*)(dest + 4) = *(int*)(src + 4);
439+
*(int*)(dest + 8) = *(int*)(src + 8);
440+
*(int*)(dest + 12) = *(int*)(src + 12);
441+
#endif
442+
*(short*)(dest + 16) = *(short*)(src + 16);
443+
*(dest + 18) = *(src + 18);
444+
return;
445+
case 20:
446+
#if BIT64
447+
*(long*)dest = *(long*)src;
448+
*(long*)(dest + 8) = *(long*)(src + 8);
449+
#else
450+
*(int*)dest = *(int*)src;
451+
*(int*)(dest + 4) = *(int*)(src + 4);
452+
*(int*)(dest + 8) = *(int*)(src + 8);
453+
*(int*)(dest + 12) = *(int*)(src + 12);
454+
#endif
455+
*(int*)(dest + 16) = *(int*)(src + 16);
456+
return;
457+
case 21:
458+
#if BIT64
459+
*(long*)dest = *(long*)src;
460+
*(long*)(dest + 8) = *(long*)(src + 8);
461+
#else
462+
*(int*)dest = *(int*)src;
463+
*(int*)(dest + 4) = *(int*)(src + 4);
464+
*(int*)(dest + 8) = *(int*)(src + 8);
465+
*(int*)(dest + 12) = *(int*)(src + 12);
466+
#endif
467+
*(int*)(dest + 16) = *(int*)(src + 16);
468+
*(dest + 20) = *(src + 20);
469+
return;
470+
case 22:
471+
#if BIT64
472+
*(long*)dest = *(long*)src;
473+
*(long*)(dest + 8) = *(long*)(src + 8);
474+
#else
475+
*(int*)dest = *(int*)src;
476+
*(int*)(dest + 4) = *(int*)(src + 4);
477+
*(int*)(dest + 8) = *(int*)(src + 8);
478+
*(int*)(dest + 12) = *(int*)(src + 12);
479+
#endif
480+
*(int*)(dest + 16) = *(int*)(src + 16);
481+
*(short*)(dest + 20) = *(short*)(src + 20);
482+
return;
406483
}
407484

408485
// P/Invoke into the native version for large lengths
409486
if (len >= 512) goto PInvoke;
410487

488+
nuint i = 0; // byte offset at which we're copying
489+
411490
if (((int)dest & 3) != 0)
412491
{
413492
if (((int)dest & 1) != 0)
414493
{
415-
*dest = *src;
416-
src++;
417-
dest++;
418-
len--;
419-
if (((int)dest & 2) == 0)
420-
goto Aligned;
494+
*(dest + i) = *(src + i);
495+
i += 1;
496+
if (((int)dest & 2) != 0)
497+
goto IntAligned;
421498
}
422-
*(short *)dest = *(short *)src;
423-
src += 2;
424-
dest += 2;
425-
len -= 2;
426-
Aligned: ;
499+
*(short*)(dest + i) = *(short*)(src + i);
500+
i += 2;
427501
}
428502

503+
IntAligned:
504+
429505
#if BIT64
430-
if (((int)dest & 4) != 0)
506+
// On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
507+
// (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
508+
// bytes to the next aligned address (respectively), so do nothing. On the other hand,
509+
// if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
510+
// we're aligned.
511+
// The thing 1, 2, 3, and 4 have in common that the others don't is that if you
512+
// subtract one from them, their 3rd lsb will not be set. Hence, the below check.
513+
514+
if ((((int)dest - 1) & 4) == 0)
431515
{
432-
*(int *)dest = *(int *)src;
433-
src += 4;
434-
dest += 4;
435-
len -= 4;
516+
*(int*)(dest + i) = *(int*)(src + i);
517+
i += 4;
436518
}
437-
#endif
519+
#endif // BIT64
438520

439-
#if BIT64
440-
ulong count = len / 16;
441-
#else
442-
uint count = len / 16;
443-
#endif
444-
while (count > 0)
521+
nuint end = len - 16;
522+
len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop
523+
524+
// We know due to the above switch-case that this loop will always run 1 iteration; max
525+
// bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
526+
// the switch handles lengths 0-22.
527+
Contract.Assert(end >= 7 && i <= end);
528+
529+
// This is separated out into a different variable, so the i + 16 addition can be
530+
// performed at the start of the pipeline and the loop condition does not have
531+
// a dependency on the writes.
532+
nuint counter;
533+
534+
do
445535
{
536+
counter = i + 16;
537+
538+
// This loop looks very costly since there appear to be a bunch of temporary values
539+
// being created with the adds, but the jit (for x86 anyways) will convert each of
540+
// these to use memory addressing operands.
541+
542+
// So the only cost is a bit of code size, which is made up for by the fact that
543+
// we save on writes to dest/src.
544+
446545
#if BIT64
447-
((long*)dest)[0] = ((long*)src)[0];
448-
((long*)dest)[1] = ((long*)src)[1];
546+
*(long*)(dest + i) = *(long*)(src + i);
547+
*(long*)(dest + i + 8) = *(long*)(src + i + 8);
449548
#else
450-
((int*)dest)[0] = ((int*)src)[0];
451-
((int*)dest)[1] = ((int*)src)[1];
452-
((int*)dest)[2] = ((int*)src)[2];
453-
((int*)dest)[3] = ((int*)src)[3];
549+
*(int*)(dest + i) = *(int*)(src + i);
550+
*(int*)(dest + i + 4) = *(int*)(src + i + 4);
551+
*(int*)(dest + i + 8) = *(int*)(src + i + 8);
552+
*(int*)(dest + i + 12) = *(int*)(src + i + 12);
454553
#endif
455-
dest += 16;
456-
src += 16;
457-
count--;
554+
555+
i = counter;
556+
557+
// See notes above for why this wasn't used instead
558+
// i += 16;
458559
}
560+
while (counter <= end);
459561

460562
if ((len & 8) != 0)
461563
{
462564
#if BIT64
463-
((long*)dest)[0] = ((long*)src)[0];
565+
*(long*)(dest + i) = *(long*)(src + i);
464566
#else
465-
((int*)dest)[0] = ((int*)src)[0];
466-
((int*)dest)[1] = ((int*)src)[1];
567+
*(int*)(dest + i) = *(int*)(src + i);
568+
*(int*)(dest + i + 4) = *(int*)(src + i + 4);
467569
#endif
468-
dest += 8;
469-
src += 8;
470-
}
471-
if ((len & 4) != 0)
472-
{
473-
((int*)dest)[0] = ((int*)src)[0];
474-
dest += 4;
475-
src += 4;
476-
}
477-
if ((len & 2) != 0)
478-
{
479-
((short*)dest)[0] = ((short*)src)[0];
480-
dest += 2;
481-
src += 2;
482-
}
483-
if ((len & 1) != 0)
484-
*dest = *src;
570+
i += 8;
571+
}
572+
if ((len & 4) != 0)
573+
{
574+
*(int*)(dest + i) = *(int*)(src + i);
575+
i += 4;
576+
}
577+
if ((len & 2) != 0)
578+
{
579+
*(short*)(dest + i) = *(short*)(src + i);
580+
i += 2;
581+
}
582+
if ((len & 1) != 0)
583+
{
584+
*(dest + i) = *(src + i);
585+
// We're not using i after this, so not needed
586+
// i += 1;
587+
}
485588

486589
return;
487590

@@ -495,11 +598,7 @@ internal unsafe static void Memmove(byte* dest, byte* src, uint len)
495598
[SecurityCritical]
496599
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
497600
[MethodImplAttribute(MethodImplOptions.NoInlining)]
498-
#if BIT64
499-
private unsafe static void _Memmove(byte* dest, byte* src, ulong len)
500-
#else
501-
private unsafe static void _Memmove(byte* dest, byte* src, uint len)
502-
#endif
601+
private unsafe static void _Memmove(byte* dest, byte* src, nuint len)
503602
{
504603
__Memmove(dest, src, len);
505604
}
@@ -508,12 +607,7 @@ private unsafe static void _Memmove(byte* dest, byte* src, uint len)
508607
[SuppressUnmanagedCodeSecurity]
509608
[SecurityCritical]
510609
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
511-
#if BIT64
512-
extern private unsafe static void __Memmove(byte* dest, byte* src, ulong len);
513-
#else
514-
extern private unsafe static void __Memmove(byte* dest, byte* src, uint len);
515-
#endif
516-
610+
extern private unsafe static void __Memmove(byte* dest, byte* src, nuint len);
517611

518612
// The attributes on this method are chosen for best JIT performance.
519613
// Please do not edit unless intentional.
@@ -526,11 +620,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
526620
{
527621
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
528622
}
529-
#if BIT64
530-
Memmove((byte*)destination, (byte*)source, checked((ulong) sourceBytesToCopy));
531-
#else
532-
Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
533-
#endif // BIT64
623+
Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
534624
}
535625

536626

@@ -547,7 +637,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
547637
}
548638
#if BIT64
549639
Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
550-
#else
640+
#else // BIT64
551641
Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
552642
#endif // BIT64
553643
}

0 commit comments

Comments
 (0)