Skip to content

Commit 5648c89

Browse files
authored
Improve performance of column cloning inside DataFrame arithmetics (#6814)
* Optimize PrimitiveColumnContainer.Clone method * Avoid unnecessary type conversion during binary operations * Remove using * Fix DataFrameBuffer constructor * Remove incorrectly added using * Make DataFrameBuffer Length field protected * Fix typo * Use RawSpan
1 parent 15e6a55 commit 5648c89

8 files changed

+103
-101
lines changed

src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -213,9 +213,9 @@ private void Append(ReadOnlySpan<byte> value)
213213
_offsetsBuffers.Add(mutableOffsetsBuffer);
214214
mutableOffsetsBuffer.Append(0);
215215
}
216-
mutableDataBuffer.EnsureCapacity(value.Length);
217-
value.CopyTo(mutableDataBuffer.RawSpan.Slice(mutableDataBuffer.Length));
218-
mutableDataBuffer.Length += value.Length;
216+
var startIndex = mutableDataBuffer.Length;
217+
mutableDataBuffer.IncreaseSize(value.Length);
218+
value.CopyTo(mutableDataBuffer.RawSpan.Slice(startIndex));
219219
mutableOffsetsBuffer.Append(mutableOffsetsBuffer[mutableOffsetsBuffer.Length - 1] + value.Length);
220220
}
221221
SetValidityBit(Length - 1, value != default);

src/Microsoft.Data.Analysis/DataFrameBuffer.cs

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@ namespace Microsoft.Data.Analysis
1515
internal class DataFrameBuffer<T> : ReadOnlyDataFrameBuffer<T>
1616
where T : unmanaged
1717
{
18+
private const int MinCapacity = 8;
19+
1820
private Memory<byte> _memory;
1921

2022
public override ReadOnlyMemory<byte> ReadOnlyBuffer => _memory;
@@ -36,24 +38,35 @@ public Span<T> RawSpan
3638
get => MemoryMarshal.Cast<byte, T>(Buffer.Span);
3739
}
3840

39-
public DataFrameBuffer(int numberOfValues = 8) : base(numberOfValues) { }
41+
public DataFrameBuffer(int capacity = 0)
42+
{
43+
if ((long)capacity * Size > MaxCapacity)
44+
{
45+
throw new ArgumentException($"{capacity} exceeds buffer capacity", nameof(capacity));
46+
}
47+
48+
_memory = new byte[Math.Max(capacity, MinCapacity)];
49+
}
4050

41-
internal DataFrameBuffer(ReadOnlyMemory<byte> buffer, int length) : base(buffer, length)
51+
internal DataFrameBuffer(ReadOnlyMemory<byte> buffer, int length)
4252
{
4353
_memory = new byte[buffer.Length];
4454
buffer.CopyTo(_memory);
55+
Length = length;
4556
}
4657

4758
public void Append(T value)
4859
{
49-
if (Length == MaxCapacity)
50-
{
51-
throw new ArgumentException("Current buffer is full", nameof(value));
52-
}
5360
EnsureCapacity(1);
54-
if (Length < MaxCapacity)
55-
++Length;
56-
Span[Length - 1] = value;
61+
62+
RawSpan[Length] = value;
63+
Length++;
64+
}
65+
66+
public void IncreaseSize(int numberOfValues)
67+
{
68+
EnsureCapacity(numberOfValues);
69+
Length += numberOfValues;
5770
}
5871

5972
public void EnsureCapacity(int numberOfValues)

src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs

Lines changed: 20 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -67,9 +67,8 @@ public PrimitiveColumnContainer(ReadOnlyMemory<byte> buffer, ReadOnlyMemory<byte
6767
ReadOnlyDataFrameBuffer<T> dataBuffer;
6868
if (buffer.IsEmpty)
6969
{
70-
DataFrameBuffer<T> mutableBuffer = new DataFrameBuffer<T>();
71-
mutableBuffer.EnsureCapacity(length);
72-
mutableBuffer.Length = length;
70+
DataFrameBuffer<T> mutableBuffer = new DataFrameBuffer<T>(length);
71+
mutableBuffer.IncreaseSize(length);
7372
mutableBuffer.RawSpan.Fill(default(T));
7473
dataBuffer = mutableBuffer;
7574
}
@@ -172,15 +171,12 @@ public void AppendMany(T? value, long count)
172171

173172
//Calculate how many values we can additionally allocate without exceeding the MaxCapacity
174173
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
175-
mutableLastBuffer.EnsureCapacity(allocatable);
174+
mutableLastBuffer.IncreaseSize(allocatable);
176175

177176
DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
178177
int nullBufferAllocatable = (allocatable + 7) / 8;
179-
lastNullBitMapBuffer.EnsureCapacity(nullBufferAllocatable);
178+
lastNullBitMapBuffer.IncreaseSize(nullBufferAllocatable);
180179

181-
182-
mutableLastBuffer.Length += allocatable;
183-
lastNullBitMapBuffer.Length += nullBufferAllocatable;
184180
Length += allocatable;
185181

186182
if (value.HasValue)
@@ -436,13 +432,8 @@ private List<ReadOnlyDataFrameBuffer<byte>> CloneNullBitMapBuffers()
436432
List<ReadOnlyDataFrameBuffer<byte>> ret = new List<ReadOnlyDataFrameBuffer<byte>>();
437433
foreach (ReadOnlyDataFrameBuffer<byte> buffer in NullBitMapBuffers)
438434
{
439-
DataFrameBuffer<byte> newBuffer = new DataFrameBuffer<byte>();
435+
DataFrameBuffer<byte> newBuffer = new DataFrameBuffer<byte>(buffer.ReadOnlyBuffer, buffer.Length);
440436
ret.Add(newBuffer);
441-
ReadOnlySpan<byte> span = buffer.ReadOnlySpan;
442-
for (int i = 0; i < span.Length; i++)
443-
{
444-
newBuffer.Append(span[i]);
445-
}
446437
}
447438
return ret;
448439
}
@@ -518,14 +509,9 @@ public PrimitiveColumnContainer<T> Clone()
518509
var ret = new PrimitiveColumnContainer<T>();
519510
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
520511
{
521-
DataFrameBuffer<T> newBuffer = new DataFrameBuffer<T>();
512+
DataFrameBuffer<T> newBuffer = new DataFrameBuffer<T>(buffer.ReadOnlyBuffer, buffer.Length);
522513
ret.Buffers.Add(newBuffer);
523-
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
524514
ret.Length += buffer.Length;
525-
for (int i = 0; i < span.Length; i++)
526-
{
527-
newBuffer.Append(span[i]);
528-
}
529515
}
530516
ret.NullBitMapBuffers = CloneNullBitMapBuffers();
531517
ret.NullCount = NullCount;
@@ -537,9 +523,10 @@ internal PrimitiveColumnContainer<bool> CloneAsBoolContainer()
537523
var ret = new PrimitiveColumnContainer<bool>();
538524
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
539525
{
540-
DataFrameBuffer<bool> newBuffer = new DataFrameBuffer<bool>();
526+
DataFrameBuffer<bool> newBuffer = new DataFrameBuffer<bool>(buffer.Length);
541527
ret.Buffers.Add(newBuffer);
542-
newBuffer.EnsureCapacity(buffer.Length);
528+
newBuffer.IncreaseSize(buffer.Length);
529+
543530
if (typeof(T) == typeof(bool))
544531
{
545532
var localBuffer = buffer;
@@ -550,7 +537,6 @@ internal PrimitiveColumnContainer<bool> CloneAsBoolContainer()
550537
{
551538
newBuffer.Span.Fill(false);
552539
}
553-
newBuffer.Length = buffer.Length;
554540
ret.Length += buffer.Length;
555541
}
556542
ret.NullBitMapBuffers = CloneNullBitMapBuffers();
@@ -564,9 +550,8 @@ internal PrimitiveColumnContainer<byte> CloneAsByteContainer()
564550
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
565551
{
566552
ret.Length += buffer.Length;
567-
DataFrameBuffer<byte> newBuffer = new DataFrameBuffer<byte>();
553+
DataFrameBuffer<byte> newBuffer = new DataFrameBuffer<byte>(buffer.Length);
568554
ret.Buffers.Add(newBuffer);
569-
newBuffer.EnsureCapacity(buffer.Length);
570555
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
571556
for (int i = 0; i < span.Length; i++)
572557
{
@@ -584,9 +569,8 @@ internal PrimitiveColumnContainer<sbyte> CloneAsSByteContainer()
584569
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
585570
{
586571
ret.Length += buffer.Length;
587-
DataFrameBuffer<sbyte> newBuffer = new DataFrameBuffer<sbyte>();
572+
DataFrameBuffer<sbyte> newBuffer = new DataFrameBuffer<sbyte>(buffer.Length);
588573
ret.Buffers.Add(newBuffer);
589-
newBuffer.EnsureCapacity(buffer.Length);
590574
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
591575
for (int i = 0; i < span.Length; i++)
592576
{
@@ -604,9 +588,8 @@ internal PrimitiveColumnContainer<double> CloneAsDoubleContainer()
604588
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
605589
{
606590
ret.Length += buffer.Length;
607-
DataFrameBuffer<double> newBuffer = new DataFrameBuffer<double>();
591+
DataFrameBuffer<double> newBuffer = new DataFrameBuffer<double>(buffer.Length);
608592
ret.Buffers.Add(newBuffer);
609-
newBuffer.EnsureCapacity(buffer.Length);
610593
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
611594
for (int i = 0; i < span.Length; i++)
612595
{
@@ -624,9 +607,8 @@ internal PrimitiveColumnContainer<decimal> CloneAsDecimalContainer()
624607
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
625608
{
626609
ret.Length += buffer.Length;
627-
DataFrameBuffer<decimal> newBuffer = new DataFrameBuffer<decimal>();
610+
DataFrameBuffer<decimal> newBuffer = new DataFrameBuffer<decimal>(buffer.Length);
628611
ret.Buffers.Add(newBuffer);
629-
newBuffer.EnsureCapacity(buffer.Length);
630612
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
631613
for (int i = 0; i < span.Length; i++)
632614
{
@@ -644,9 +626,8 @@ internal PrimitiveColumnContainer<short> CloneAsShortContainer()
644626
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
645627
{
646628
ret.Length += buffer.Length;
647-
DataFrameBuffer<short> newBuffer = new DataFrameBuffer<short>();
629+
DataFrameBuffer<short> newBuffer = new DataFrameBuffer<short>(buffer.Length);
648630
ret.Buffers.Add(newBuffer);
649-
newBuffer.EnsureCapacity(buffer.Length);
650631
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
651632
for (int i = 0; i < span.Length; i++)
652633
{
@@ -664,9 +645,8 @@ internal PrimitiveColumnContainer<ushort> CloneAsUShortContainer()
664645
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
665646
{
666647
ret.Length += buffer.Length;
667-
DataFrameBuffer<ushort> newBuffer = new DataFrameBuffer<ushort>();
648+
DataFrameBuffer<ushort> newBuffer = new DataFrameBuffer<ushort>(buffer.Length);
668649
ret.Buffers.Add(newBuffer);
669-
newBuffer.EnsureCapacity(buffer.Length);
670650
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
671651
for (int i = 0; i < span.Length; i++)
672652
{
@@ -684,9 +664,8 @@ internal PrimitiveColumnContainer<int> CloneAsIntContainer()
684664
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
685665
{
686666
ret.Length += buffer.Length;
687-
DataFrameBuffer<int> newBuffer = new DataFrameBuffer<int>();
667+
DataFrameBuffer<int> newBuffer = new DataFrameBuffer<int>(buffer.Length);
688668
ret.Buffers.Add(newBuffer);
689-
newBuffer.EnsureCapacity(buffer.Length);
690669
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
691670
for (int i = 0; i < span.Length; i++)
692671
{
@@ -704,9 +683,8 @@ internal PrimitiveColumnContainer<uint> CloneAsUIntContainer()
704683
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
705684
{
706685
ret.Length += buffer.Length;
707-
DataFrameBuffer<uint> newBuffer = new DataFrameBuffer<uint>();
686+
DataFrameBuffer<uint> newBuffer = new DataFrameBuffer<uint>(buffer.Length);
708687
ret.Buffers.Add(newBuffer);
709-
newBuffer.EnsureCapacity(buffer.Length);
710688
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
711689
for (int i = 0; i < span.Length; i++)
712690
{
@@ -724,9 +702,8 @@ internal PrimitiveColumnContainer<long> CloneAsLongContainer()
724702
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
725703
{
726704
ret.Length += buffer.Length;
727-
DataFrameBuffer<long> newBuffer = new DataFrameBuffer<long>();
705+
DataFrameBuffer<long> newBuffer = new DataFrameBuffer<long>(buffer.Length);
728706
ret.Buffers.Add(newBuffer);
729-
newBuffer.EnsureCapacity(buffer.Length);
730707
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
731708
for (int i = 0; i < span.Length; i++)
732709
{
@@ -744,9 +721,8 @@ internal PrimitiveColumnContainer<ulong> CloneAsULongContainer()
744721
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
745722
{
746723
ret.Length += buffer.Length;
747-
DataFrameBuffer<ulong> newBuffer = new DataFrameBuffer<ulong>();
724+
DataFrameBuffer<ulong> newBuffer = new DataFrameBuffer<ulong>(buffer.Length);
748725
ret.Buffers.Add(newBuffer);
749-
newBuffer.EnsureCapacity(buffer.Length);
750726
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
751727
for (int i = 0; i < span.Length; i++)
752728
{
@@ -764,9 +740,8 @@ internal PrimitiveColumnContainer<float> CloneAsFloatContainer()
764740
foreach (ReadOnlyDataFrameBuffer<T> buffer in Buffers)
765741
{
766742
ret.Length += buffer.Length;
767-
DataFrameBuffer<float> newBuffer = new DataFrameBuffer<float>();
743+
DataFrameBuffer<float> newBuffer = new DataFrameBuffer<float>(buffer.Length);
768744
ret.Buffers.Add(newBuffer);
769-
newBuffer.EnsureCapacity(buffer.Length);
770745
ReadOnlySpan<T> span = buffer.ReadOnlySpan;
771746
for (int i = 0; i < span.Length; i++)
772747
{

0 commit comments

Comments
 (0)