-
Notifications
You must be signed in to change notification settings - Fork 523
Loop unrolled direct string inject #498
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,8 +9,6 @@ namespace Microsoft.AspNet.Server.Kestrel.Infrastructure | |
{ | ||
public static class MemoryPoolIterator2Extensions | ||
{ | ||
private const int _maxStackAllocBytes = 16384; | ||
|
||
private static readonly Encoding _utf8 = Encoding.UTF8; | ||
|
||
public const string HttpConnectMethod = "CONNECT"; | ||
|
@@ -70,102 +68,110 @@ private unsafe static long GetAsciiStringAsLong(string str) | |
} | ||
} | ||
|
||
private static unsafe string GetAsciiStringStack(byte[] input, int inputOffset, int length) | ||
public unsafe static string GetAsciiString(this MemoryPoolIterator2 start, MemoryPoolIterator2 end) | ||
{ | ||
// avoid declaring other local vars, or doing work with stackalloc | ||
// to prevent the .locals init cil flag , see: https://github.com/dotnet/coreclr/issues/1279 | ||
char* output = stackalloc char[length]; | ||
|
||
return GetAsciiStringImplementation(output, input, inputOffset, length); | ||
} | ||
|
||
private static unsafe string GetAsciiStringImplementation(char* output, byte[] input, int inputOffset, int length) | ||
{ | ||
for (var i = 0; i < length; i++) | ||
if (start.IsDefault || end.IsDefault) | ||
{ | ||
output[i] = (char)input[inputOffset + i]; | ||
return null; | ||
} | ||
|
||
return new string(output, 0, length); | ||
} | ||
|
||
private static unsafe string GetAsciiStringStack(MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length) | ||
{ | ||
// avoid declaring other local vars, or doing work with stackalloc | ||
// to prevent the .locals init cil flag , see: https://github.com/dotnet/coreclr/issues/1279 | ||
char* output = stackalloc char[length]; | ||
|
||
return GetAsciiStringImplementation(output, start, end, inputOffset, length); | ||
} | ||
|
||
private unsafe static string GetAsciiStringHeap(MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length) | ||
{ | ||
var buffer = new char[length]; | ||
var length = start.GetLength(end); | ||
|
||
fixed (char* output = buffer) | ||
if (length == 0) | ||
{ | ||
return GetAsciiStringImplementation(output, start, end, inputOffset, length); | ||
return null; | ||
} | ||
} | ||
|
||
private static unsafe string GetAsciiStringImplementation(char* output, MemoryPoolBlock2 start, MemoryPoolIterator2 end, int inputOffset, int length) | ||
{ | ||
var outputOffset = 0; | ||
var block = start; | ||
var remaining = length; | ||
// Bytes out of the range of ascii are treated as "opaque data" | ||
// and kept in string as a char value that casts to same input byte value | ||
// https://tools.ietf.org/html/rfc7230#section-3.2.4 | ||
|
||
var endBlock = end.Block; | ||
var endIndex = end.Index; | ||
var inputOffset = start.Index; | ||
var block = start.Block; | ||
|
||
while (true) | ||
var asciiString = new string('\0', length); | ||
|
||
fixed (char* outputStart = asciiString) | ||
{ | ||
int following = (block != endBlock ? block.End : endIndex) - inputOffset; | ||
var output = outputStart; | ||
var remaining = length; | ||
|
||
if (following > 0) | ||
var endBlock = end.Block; | ||
var endIndex = end.Index; | ||
|
||
while (true) | ||
{ | ||
var input = block.Array; | ||
for (var i = 0; i < following; i++) | ||
int following = (block != endBlock ? block.End : endIndex) - inputOffset; | ||
|
||
if (following > 0) | ||
{ | ||
output[i + outputOffset] = (char)input[i + inputOffset]; | ||
fixed (byte* blockStart = block.Array) | ||
{ | ||
var input = blockStart + inputOffset; | ||
var i = 0; | ||
while (i < following - 11) | ||
{ | ||
i += 12; | ||
*(output) = (char)*(input); | ||
*(output + 1) = (char)*(input + 1); | ||
*(output + 2) = (char)*(input + 2); | ||
*(output + 3) = (char)*(input + 3); | ||
*(output + 4) = (char)*(input + 4); | ||
*(output + 5) = (char)*(input + 5); | ||
*(output + 6) = (char)*(input + 6); | ||
*(output + 7) = (char)*(input + 7); | ||
*(output + 8) = (char)*(input + 8); | ||
*(output + 9) = (char)*(input + 9); | ||
*(output + 10) = (char)*(input + 10); | ||
*(output + 11) = (char)*(input + 11); | ||
output += 12; | ||
input += 12; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like that this PR removes more code than it adds, but this part now seems slightly more complicated. Are we sure that manual unrolling is faster? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep: #399 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I remember that now 😄 It's interesting how UnrolledParallelPointer is so consistently better from 1 character all the way to 16384 character strings. It might be interesting to see how something unaligned like a 33 character string performs, since I don't think any of the test cases hit both loops in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah good point, will get that in new tests when doing looking for new numbers. The ParallelPointer works because the cpu can pipeline the instructions as they are independent of each other (i.e. the effects of one line don't effect the results of the next); incrementing the pointer at each step means the next step can't run until that pointer addition is completed as its dependent on the result. Is also one of the problems with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also If I didn't unroll I think this would be a perf regression as the string is initialized; rather than being operations on initialized stack memory; then a memcopy to an uninitialized string; however I think the code being less weird makes up for that... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I definitely like the code being less weird. Interesting point about the instruction pipelining. That explains the somewhat unintuitive results. It's amazing the stuff you have to consider at this level of optimization. Someday maybe the compiler will be smart enough to do this kind of thing for us. I'm going to keep this PR open pending the new numbers. |
||
if (i < following - 5) | ||
{ | ||
i += 6; | ||
*(output) = (char)*(input); | ||
*(output + 1) = (char)*(input + 1); | ||
*(output + 2) = (char)*(input + 2); | ||
*(output + 3) = (char)*(input + 3); | ||
*(output + 4) = (char)*(input + 4); | ||
*(output + 5) = (char)*(input + 5); | ||
output += 6; | ||
input += 6; | ||
} | ||
if (i < following - 3) | ||
{ | ||
i += 4; | ||
*(output) = (char)*(input); | ||
*(output + 1) = (char)*(input + 1); | ||
*(output + 2) = (char)*(input + 2); | ||
*(output + 3) = (char)*(input + 3); | ||
output += 4; | ||
input += 4; | ||
} | ||
while (i < following) | ||
{ | ||
i++; | ||
*output = (char)*input; | ||
output++; | ||
input++; | ||
} | ||
|
||
remaining -= following; | ||
} | ||
} | ||
|
||
remaining -= following; | ||
outputOffset += following; | ||
} | ||
if (remaining == 0) | ||
{ | ||
break; | ||
} | ||
|
||
if (remaining == 0) | ||
{ | ||
return new string(output, 0, length); | ||
block = block.Next; | ||
inputOffset = block.Start; | ||
} | ||
|
||
block = block.Next; | ||
inputOffset = block.Start; | ||
} | ||
} | ||
|
||
public static string GetAsciiString(this MemoryPoolIterator2 start, MemoryPoolIterator2 end) | ||
{ | ||
if (start.IsDefault || end.IsDefault) | ||
{ | ||
return default(string); | ||
} | ||
|
||
var length = start.GetLength(end); | ||
|
||
// Bytes out of the range of ascii are treated as "opaque data" | ||
// and kept in string as a char value that casts to same input byte value | ||
// https://tools.ietf.org/html/rfc7230#section-3.2.4 | ||
if (end.Block == start.Block) | ||
{ | ||
return GetAsciiStringStack(start.Block.Array, start.Index, length); | ||
} | ||
|
||
if (length > _maxStackAllocBytes) | ||
{ | ||
return GetAsciiStringHeap(start.Block, end, start.Index, length); | ||
} | ||
|
||
return GetAsciiStringStack(start.Block, end, start.Index, length); | ||
return asciiString; | ||
} | ||
|
||
public static string GetUtf8String(this MemoryPoolIterator2 start, MemoryPoolIterator2 end) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is a further combining change that will this fixed block (as buffers are already fixed) - but this stands alone as is.