Skip to content

Commit 4c772c8

Browse files
committed
Merge remote-tracking branch 'origin/main' into 6110_unable_to_filter_by_null
2 parents 655f268 + caee3c2 commit 4c772c8

25 files changed

+540
-216
lines changed

.vsts-dotnet-ci.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,20 @@
22
# ML.NET's PR validation build
33
################################################################################
44

5+
pr:
6+
branches:
7+
include:
8+
- main
9+
- feature/*
10+
- release/*
11+
12+
trigger:
13+
branches:
14+
include:
15+
- main
16+
- feature/*
17+
- release/*
18+
519
resources:
620
containers:
721
- container: CentosContainer

build/.night-build.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@ schedules:
1515
branches:
1616
include:
1717
- main
18-
- releases/1.6.0
19-
- features/automl
20-
- features/integrationPackage
18+
- feature/*
19+
- release/*
2120
always: true
2221

2322
resources:

build/.outer-loop-build.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@ schedules:
1515
branches:
1616
include:
1717
- main
18-
- releases/1.6.0
19-
- features/automl
20-
- features/integrationPackage
18+
- feature/*
19+
- release/*
2120
always: true
2221

2322

build/codecoverage-ci.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,20 @@
22
# ML.NET's Code Coverage validation build
33
################################################################################
44

5+
pr:
6+
branches:
7+
include:
8+
- main
9+
- feature/*
10+
- release/*
11+
12+
trigger:
13+
branches:
14+
include:
15+
- main
16+
- feature/*
17+
- release/*
18+
519
jobs:
620
- template: /build/ci/job-template.yml
721
parameters:

eng/Versions.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
<SystemTextJsonVersion>6.0.1</SystemTextJsonVersion>
3131
<SystemThreadingChannelsVersion>4.7.1</SystemThreadingChannelsVersion>
3232
<!-- Other product dependencies -->
33-
<ApacheArrowVersion>2.0.0</ApacheArrowVersion>
33+
<ApacheArrowVersion>11.0.0</ApacheArrowVersion>
3434
<GoogleProtobufVersion>3.19.6</GoogleProtobufVersion>
3535
<LightGBMVersion>2.3.1</LightGBMVersion>
3636
<MicrosoftCodeAnalysisAnalyzersVersion>3.3.0</MicrosoftCodeAnalysisAnalyzersVersion>

src/Microsoft.Data.Analysis/DataFrame.Arrow.cs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,18 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
101101
AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, field.Name + "_");
102102
}
103103
break;
104-
case ArrowTypeId.Decimal:
104+
case ArrowTypeId.Date64:
105+
Date64Array arrowDate64Array = (Date64Array)arrowArray;
106+
dataFrameColumn = new DateTimeDataFrameColumn(fieldName, arrowDate64Array.Data.Length);
107+
for (int i = 0; i < arrowDate64Array.Data.Length; i++)
108+
{
109+
dataFrameColumn[i] = arrowDate64Array.GetDateTime(i);
110+
}
111+
break;
112+
case ArrowTypeId.Decimal128:
113+
case ArrowTypeId.Decimal256:
105114
case ArrowTypeId.Binary:
106115
case ArrowTypeId.Date32:
107-
case ArrowTypeId.Date64:
108116
case ArrowTypeId.Dictionary:
109117
case ArrowTypeId.FixedSizedBinary:
110118
case ArrowTypeId.HalfFloat:
@@ -114,6 +122,7 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
114122
case ArrowTypeId.Null:
115123
case ArrowTypeId.Time32:
116124
case ArrowTypeId.Time64:
125+
case ArrowTypeId.Timestamp:
117126
default:
118127
throw new NotImplementedException($"{fieldType.Name}");
119128
}
@@ -145,7 +154,7 @@ public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch)
145154
}
146155

147156
/// <summary>
148-
/// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
157+
/// Returns an <see cref="IEnumerable{RecordBatch}"/> mostly without copying data
149158
/// </summary>
150159
public IEnumerable<RecordBatch> ToArrowRecordBatches()
151160
{

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ private static DataFrameColumn CreateColumn(Type kind, string columnName)
336336
}
337337
else if (kind == typeof(DateTime))
338338
{
339-
ret = new PrimitiveDataFrameColumn<DateTime>(columnName);
339+
ret = new DateTimeDataFrameColumn(columnName);
340340
}
341341
else
342342
{

src/Microsoft.Data.Analysis/DataFrame.Join.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ private void SetSuffixForDuplicatedColumnNames(DataFrame dataFrame, DataFrameCol
3030
{
3131
// Pre-existing column. Change name
3232
DataFrameColumn existingColumn = dataFrame.Columns[index];
33-
dataFrame._columnCollection.SetColumnName(existingColumn, existingColumn.Name + leftSuffix);
33+
existingColumn.SetName(existingColumn.Name + leftSuffix);
3434
column.SetName(column.Name + rightSuffix);
3535
index = dataFrame._columnCollection.IndexOf(column.Name);
3636
}

src/Microsoft.Data.Analysis/DataFrame.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ public DataFrame AddPrefix(string prefix, bool inPlace = false)
301301
for (int i = 0; i < df.Columns.Count; i++)
302302
{
303303
DataFrameColumn column = df.Columns[i];
304-
df._columnCollection.SetColumnName(column, prefix + column.Name);
304+
column.SetName(prefix + column.Name);
305305
df.OnColumnsChanged();
306306
}
307307
return df;
@@ -316,7 +316,7 @@ public DataFrame AddSuffix(string suffix, bool inPlace = false)
316316
for (int i = 0; i < df.Columns.Count; i++)
317317
{
318318
DataFrameColumn column = df.Columns[i];
319-
df._columnCollection.SetColumnName(column, column.Name + suffix);
319+
column.SetName(column.Name + suffix);
320320
df.OnColumnsChanged();
321321
}
322322
return df;

src/Microsoft.Data.Analysis/DataFrameBuffer.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,10 @@ public void EnsureCapacity(int numberOfValues)
6666

6767
if (newLength > Capacity)
6868
{
69-
var newCapacity = Math.Max(newLength * Size, ReadOnlyBuffer.Length * 2);
69+
//Double the buffer size, but do not exceed MaxCapacityInBytes
70+
var doubledSize = (int)Math.Min((long)ReadOnlyBuffer.Length * 2, MaxCapacityInBytes);
71+
var newCapacity = Math.Max(newLength * Size, doubledSize);
72+
7073
var memory = new Memory<byte>(new byte[newCapacity]);
7174
_memory.CopyTo(memory);
7275
_memory = memory;

src/Microsoft.Data.Analysis/DataFrameColumn.cs

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,26 @@ protected set
8484
}
8585
}
8686

87+
// List of ColumnCollections that own the column
88+
// The current API allows a column to be added to multiple dataframes, which is why a list is needed
89+
private readonly List<DataFrameColumnCollection> _ownerColumnCollections = new();
90+
91+
internal void AddOwner(DataFrameColumnCollection columCollection)
92+
{
93+
if (!_ownerColumnCollections.Contains(columCollection))
94+
{
95+
_ownerColumnCollections.Add(columCollection);
96+
}
97+
}
98+
99+
internal void RemoveOwner(DataFrameColumnCollection columCollection)
100+
{
101+
if (_ownerColumnCollections.Contains(columCollection))
102+
{
103+
_ownerColumnCollections.Remove(columCollection);
104+
}
105+
}
106+
87107
/// <summary>
88108
/// The number of <see langword="null" /> values in this column.
89109
/// </summary>
@@ -95,24 +115,30 @@ public abstract long NullCount
95115
private string _name;
96116

97117
/// <summary>
98-
/// The name of this column.
118+
/// The column name.
99119
/// </summary>
100120
public string Name => _name;
101121

102122
/// <summary>
103-
/// Updates the name of this column.
123+
/// Updates the column name.
104124
/// </summary>
105125
/// <param name="newName">The new name.</param>
106-
/// <param name="dataFrame">If passed in, update the column name in <see cref="DataFrame.Columns"/></param>
107-
public void SetName(string newName, DataFrame dataFrame = null)
126+
public void SetName(string newName)
108127
{
109-
if (!(dataFrame is null))
110-
{
111-
dataFrame.Columns.SetColumnName(this, newName);
112-
}
128+
foreach (var owner in _ownerColumnCollections)
129+
owner.UpdateColumnNameMetadata(this, newName);
130+
113131
_name = newName;
114132
}
115133

134+
/// <summary>
135+
/// Updates the name of this column.
136+
/// </summary>
137+
/// <param name="newName">The new name.</param>
138+
/// <param name="dataFrame">Ignored (for backward compatibility)</param>
139+
[Obsolete]
140+
public void SetName(string newName, DataFrame dataFrame) => SetName(newName);
141+
116142
/// <summary>
117143
/// The type of data this column holds.
118144
/// </summary>

src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,23 @@ internal IReadOnlyList<string> GetColumnNames()
3838
return ret;
3939
}
4040

41+
public void RenameColumn(string currentName, string newName)
42+
{
43+
var column = this[currentName];
44+
column.SetName(newName);
45+
}
46+
47+
[Obsolete]
4148
public void SetColumnName(DataFrameColumn column, string newName)
49+
{
50+
column.SetName(newName);
51+
}
52+
53+
//Updates the column's name metadata (used as a callback from the DataFrameColumn class)
54+
internal void UpdateColumnNameMetadata(DataFrameColumn column, string newName)
4255
{
4356
string currentName = column.Name;
4457
int currentIndex = _columnNameToIndexDictionary[currentName];
45-
column.SetName(newName);
4658
_columnNameToIndexDictionary.Remove(currentName);
4759
_columnNameToIndexDictionary.Add(newName, currentIndex);
4860
ColumnsChanged?.Invoke();
@@ -75,6 +87,8 @@ protected override void InsertItem(int columnIndex, DataFrameColumn column)
7587
throw new ArgumentException(string.Format(Strings.DuplicateColumnName, column.Name), nameof(column));
7688
}
7789

90+
column.AddOwner(this);
91+
7892
RowCount = column.Length;
7993

8094
_columnNameToIndexDictionary[column.Name] = columnIndex;
@@ -98,9 +112,13 @@ protected override void SetItem(int columnIndex, DataFrameColumn column)
98112
{
99113
throw new ArgumentException(string.Format(Strings.DuplicateColumnName, column.Name), nameof(column));
100114
}
115+
101116
_columnNameToIndexDictionary.Remove(this[columnIndex].Name);
102117
_columnNameToIndexDictionary[column.Name] = columnIndex;
118+
119+
this[columnIndex].RemoveOwner(this);
103120
base.SetItem(columnIndex, column);
121+
104122
ColumnsChanged?.Invoke();
105123
}
106124

@@ -111,6 +129,8 @@ protected override void RemoveItem(int columnIndex)
111129
{
112130
_columnNameToIndexDictionary[this[i].Name]--;
113131
}
132+
133+
this[columnIndex].RemoveOwner(this);
114134
base.RemoveItem(columnIndex);
115135

116136
//Reset RowCount if the last column was removed and dataframe is empty
@@ -474,6 +494,5 @@ public UInt16DataFrameColumn GetUInt16Column(string name)
474494

475495
throw new ArgumentException(string.Format(Strings.BadColumnCast, column.DataType, typeof(UInt16)));
476496
}
477-
478497
}
479498
}

src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ public void AppendMany(T? value, long count)
181181
}
182182

183183
DataFrameBuffer<T> mutableLastBuffer = Buffers.GetOrCreateMutable(Buffers.Count - 1);
184-
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity);
184+
185+
//Calculate how many values we can additionally allocate without exceeding MaxCapacity
186+
int allocatable = (int)Math.Min(remaining, ReadOnlyDataFrameBuffer<T>.MaxCapacity - mutableLastBuffer.Length);
185187
mutableLastBuffer.EnsureCapacity(allocatable);
186188

187189
DataFrameBuffer<byte> lastNullBitMapBuffer = NullBitMapBuffers.GetOrCreateMutable(NullBitMapBuffers.Count - 1);
@@ -205,7 +207,6 @@ public void AppendMany(T? value, long count)
205207
_modifyNullCountWhileIndexing = true;
206208
}
207209

208-
209210
remaining -= allocatable;
210211
}
211212
}
@@ -374,18 +375,6 @@ internal int MaxRecordBatchLength(long startIndex)
374375
return Buffers[arrayIndex].Length - (int)startIndex;
375376
}
376377

377-
internal ReadOnlyMemory<byte> GetValueBuffer(long startIndex)
378-
{
379-
int arrayIndex = GetArrayContainingRowIndex(startIndex);
380-
return Buffers[arrayIndex].ReadOnlyBuffer;
381-
}
382-
383-
internal ReadOnlyMemory<byte> GetNullBuffer(long startIndex)
384-
{
385-
int arrayIndex = GetArrayContainingRowIndex(startIndex);
386-
return NullBitMapBuffers[arrayIndex].ReadOnlyBuffer;
387-
}
388-
389378
public IReadOnlyList<T?> this[long startIndex, int length]
390379
{
391380
get

0 commit comments

Comments
 (0)