Skip to content
This repository was archived by the owner on Dec 18, 2018. It is now read-only.

Loop unrolled direct string inject #498

Merged
merged 1 commit into from
Jan 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ namespace Microsoft.AspNet.Server.Kestrel.Infrastructure
{
public static class MemoryPoolIterator2Extensions
{
private const int _maxStackAllocBytes = 16384;

private static readonly Encoding _utf8 = Encoding.UTF8;

public const string HttpConnectMethod = "CONNECT";
Expand Down Expand Up @@ -70,102 +68,110 @@ private unsafe static long GetAsciiStringAsLong(string str)
}
}

private static unsafe string GetAsciiStringStack(byte[] input, int inputOffset, int length)
public unsafe static string GetAsciiString(this MemoryPoolIterator2 start, MemoryPoolIterator2 end)
{
// avoid declaring other local vars, or doing work with stackalloc
// to prevent the .locals init cil flag , see: https://github.com/dotnet/coreclr/issues/1279
char* output = stackalloc char[length];

return GetAsciiStringImplementation(output, input, inputOffset, length);
}

private static unsafe string GetAsciiStringImplementation(char* output, byte[] input, int inputOffset, int length)
{
for (var i = 0; i < length; i++)
if (start.IsDefault || end.IsDefault)
{
output[i] = (char)input[inputOffset + i];
return null;
}

return new string(output, 0, length);
}

private static unsafe string GetAsciiStringStack(MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length)
{
// avoid declaring other local vars, or doing work with stackalloc
// to prevent the .locals init cil flag , see: https://github.com/dotnet/coreclr/issues/1279
char* output = stackalloc char[length];

return GetAsciiStringImplementation(output, start, end, inputOffset, length);
}

private unsafe static string GetAsciiStringHeap(MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length)
{
var buffer = new char[length];
var length = start.GetLength(end);

fixed (char* output = buffer)
if (length == 0)
{
return GetAsciiStringImplementation(output, start, end, inputOffset, length);
return null;
}
}

private static unsafe string GetAsciiStringImplementation(char* output, MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length)
{
var outputOffset = 0;
var block = start;
var remaining = length;
// Bytes out of the range of ascii are treated as "opaque data"
// and kept in string as a char value that casts to same input byte value
// https://tools.ietf.org/html/rfc7230#section-3.2.4

var endBlock = end.Block;
var endIndex = end.Index;
var inputOffset = start.Index;
var block = start.Block;

while (true)
var asciiString = new string('\0', length);

fixed (char* outputStart = asciiString)
{
int following = (block != endBlock ? block.End : endIndex) - inputOffset;
var output = outputStart;
var remaining = length;

if (following > 0)
var endBlock = end.Block;
var endIndex = end.Index;

while (true)
{
var input = block.Array;
for (var i = 0; i < following; i++)
int following = (block != endBlock ? block.End : endIndex) - inputOffset;

if (following > 0)
{
output[i + outputOffset] = (char)input[i + inputOffset];
fixed (byte* blockStart = block.Array)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a further combining change that will this fixed block (as buffers are already fixed) - but this stands alone as is.

{
var input = blockStart + inputOffset;
var i = 0;
while (i < following - 11)
{
i += 12;
*(output) = (char)*(input);
*(output + 1) = (char)*(input + 1);
*(output + 2) = (char)*(input + 2);
*(output + 3) = (char)*(input + 3);
*(output + 4) = (char)*(input + 4);
*(output + 5) = (char)*(input + 5);
*(output + 6) = (char)*(input + 6);
*(output + 7) = (char)*(input + 7);
*(output + 8) = (char)*(input + 8);
*(output + 9) = (char)*(input + 9);
*(output + 10) = (char)*(input + 10);
*(output + 11) = (char)*(input + 11);
output += 12;
input += 12;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that this PR removes more code than it adds, but this part now seems slightly more complicated. Are we sure that manual unrolling is faster?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep: #399 (comment)
Non-unrolled (SequentialPointer)
Incrementing value on each unroll step (UnrolledPointer)
Adding to value on each step; and incrementing it at end (UnrolledParallelPointer)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I remember that now 😄 It's interesting how UnrolledParallelPointer is so consistently better from 1 character all the way to 16384 character strings.

It might be interesting to see how something unaligned like a 33 character string performs, since I don't think any of the test cases hit both loops in GetAsciiStringImplementationB (UnrolledParallelPointer).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah good point, will get that in new tests when doing looking for new numbers.

The ParallelPointer works because the cpu can pipeline the instructions as they are independent of each other (i.e. the effects of one line don't effect the results of the next); incrementing the pointer at each step means the next step can't run until that pointer addition is completed as its dependent on the result. Is also one of the problems with i++ in a regular loop; the next i++ can't be run until that has completed as it is dependent of its result.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also If I didn't unroll I think this would be a perf regression as the string is initialized; rather than being operations on initialized stack memory; then a memcopy to an uninitialized string; however I think the code being less weird makes up for that...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I definitely like the code being less weird. Interesting point about the instruction pipelining. That explains the somewhat unintuitive results. It's amazing the stuff you have to consider at this level of optimization. Someday maybe the compiler will be smart enough to do this kind of thing for us.

I'm going to keep this PR open pending the new numbers.

if (i < following - 5)
{
i += 6;
*(output) = (char)*(input);
*(output + 1) = (char)*(input + 1);
*(output + 2) = (char)*(input + 2);
*(output + 3) = (char)*(input + 3);
*(output + 4) = (char)*(input + 4);
*(output + 5) = (char)*(input + 5);
output += 6;
input += 6;
}
if (i < following - 3)
{
i += 4;
*(output) = (char)*(input);
*(output + 1) = (char)*(input + 1);
*(output + 2) = (char)*(input + 2);
*(output + 3) = (char)*(input + 3);
output += 4;
input += 4;
}
while (i < following)
{
i++;
*output = (char)*input;
output++;
input++;
}

remaining -= following;
}
}

remaining -= following;
outputOffset += following;
}
if (remaining == 0)
{
break;
}

if (remaining == 0)
{
return new string(output, 0, length);
block = block.Next;
inputOffset = block.Start;
}

block = block.Next;
inputOffset = block.Start;
}
}

public static string GetAsciiString(this MemoryPoolIterator2 start, MemoryPoolIterator2 end)
{
if (start.IsDefault || end.IsDefault)
{
return default(string);
}

var length = start.GetLength(end);

// Bytes out of the range of ascii are treated as "opaque data"
// and kept in string as a char value that casts to same input byte value
// https://tools.ietf.org/html/rfc7230#section-3.2.4
if (end.Block == start.Block)
{
return GetAsciiStringStack(start.Block.Array, start.Index, length);
}

if (length > _maxStackAllocBytes)
{
return GetAsciiStringHeap(start.Block, end, start.Index, length);
}

return GetAsciiStringStack(start.Block, end, start.Index, length);
return asciiString;
}

public static string GetUtf8String(this MemoryPoolIterator2 start, MemoryPoolIterator2 end)
Expand Down
4 changes: 2 additions & 2 deletions test/Microsoft.AspNet.Server.KestrelTests/AsciiDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public class AsciiDecoderTests
[Fact]
private void FullByteRangeSupported()
{
var byteRange = Enumerable.Range(0, 255).Select(x => (byte)x).ToArray();
var byteRange = Enumerable.Range(0, 256).Select(x => (byte)x).ToArray();

var mem = MemoryPoolBlock2.Create(new ArraySegment<byte>(byteRange), IntPtr.Zero, null, null);
mem.End = byteRange.Length;
Expand Down Expand Up @@ -74,7 +74,7 @@ private void MultiBlockProducesCorrectResults()
}

[Fact]
private void HeapAllocationProducesCorrectResults()
private void LargeAllocationProducesCorrectResults()
{
var byteRange = Enumerable.Range(0, 16384 + 64).Select(x => (byte)x).ToArray();
var expectedByteRange = byteRange.Concat(byteRange).ToArray();
Expand Down