Support for Bucketizer #378
Merged
Changes from 11 commits (27 commits total):

95d0014  implement bucketizer
fb2d019  first tests
d759e60  multi column tests
160fbf4  Merge branch 'master' into bucketizer-ml-313
97ef668  Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs (GoEddie)
4543974  Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs (GoEddie)
fd18cf4  Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs (GoEddie)
64551c9  Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs (GoEddie)
fb70f40  tidying
119f14d  Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
9891847  changes after review
d466ea2  Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
633a843  SerDe to handle double[][] for Bucketizer
f4ecbb0  remove DoubleArrayArrayParam
b3d4d0f  SerDe for double[][]
500e7ad  spacing as per other fields
298f4ec  formatting
72d36fd  adding getters to tests
696186c  rollback
33699ea  Apply suggestions from code review (GoEddie)
5b80606  Fixing comments after review
e771f86  Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
6c12e6a  fixes after review
3a9b793  Update src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs (GoEddie)
041131f  post-review tidy
9436877  Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
1968f59  var
src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs (67 additions, 0 deletions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
    [Collection("Spark E2E Tests")]
    public class BucketizerTests
    {
        private readonly SparkSession _spark;

        public BucketizerTests(SparkFixture fixture)
        {
            _spark = fixture.Spark;
        }

        [Fact]
        public void TestBucketizer()
        {
            Bucketizer bucketizer = new Bucketizer("uid")
                .SetInputCol("input_col")
                .SetOutputCol("output_col")
                .SetHandleInvalid("skip")
                .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue});

            Assert.Equal("skip", bucketizer.GetHandleInvalid());

            Assert.Equal("uid", bucketizer.Uid());

            DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

            DataFrame output = bucketizer.Transform(input);
            Assert.Contains(output.Schema().Fields, f => f.Name == "output_col");
        }

        [Fact]
        public void TestBucketizer_MultipleColumns()
        {
            Bucketizer bucketizer = new Bucketizer()
                .SetInputCols(new List<string>() {"input_col_a", "input_col_b"})
                .SetOutputCols(new List<string>() {"output_col_a", "output_col_b"})
                .SetHandleInvalid("keep")
                .SetSplitsArray(new[]
                {
                    new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue},
                    new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue}
                });

            Assert.Equal("keep", bucketizer.GetHandleInvalid());

            DataFrame input =
                _spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)");

            DataFrame output = bucketizer.Transform(input);
            Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_a");
            Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_b");
        }
    }
}
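For readers skimming the tests, here is a minimal usage sketch of the single-column path this PR adds. It assumes an already-running SparkSession named `spark` (for example, the one provided by SparkFixture above); the column name and split points are illustrative:

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

// With 5 split points there are 4 buckets:
// [MinValue, 0) -> 0.0, [0, 10) -> 1.0, [10, 50) -> 2.0, [50, MaxValue] -> 3.0
DataFrame input = spark.Sql("SELECT ID AS input_col FROM range(100)");

Bucketizer bucketizer = new Bucketizer()
    .SetInputCol("input_col")
    .SetOutputCol("output_col")
    .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue });

// output_col holds the bucket index as a double (1.0, 2.0 or 3.0 for IDs 0..99).
bucketizer.Transform(input).Show();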
src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs (194 additions, 0 deletions)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.ML.Param;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
    /// <summary>
    /// <see cref="Bucketizer"/> maps a column of continuous features to a column of feature
    /// buckets.
    ///
    /// <see cref="Bucketizer"/> can map multiple columns at once by setting the inputCols
    /// parameter. Note that when both the inputCol and inputCols parameters are set, an
    /// Exception will be thrown. The splits parameter is only used for single column usage,
    /// and splitsArray is for multiple columns.
    /// </summary>
    public class Bucketizer : IJvmObjectReferenceProvider
    {
        internal Bucketizer(JvmObjectReference jvmObject)
        {
            _jvmObject = jvmObject;
        }

        private readonly JvmObjectReference _jvmObject;
        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

        /// <summary>
        /// Create a <see cref="Bucketizer"/> without any parameters
        /// </summary>
        public Bucketizer()
        {
            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
                "org.apache.spark.ml.feature.Bucketizer");
        }

        /// <summary>
        /// Create a <see cref="Bucketizer"/> with a UID that is used to give the
        /// <see cref="Bucketizer"/> a unique ID
        /// </summary>
        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
        public Bucketizer(string uid)
        {
            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
                "org.apache.spark.ml.feature.Bucketizer", uid);
        }

        /// <summary>
        /// Split points for splitting a single column into buckets. To split multiple columns
        /// use SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same
        /// time.
        /// </summary>
        /// <param name="value">
        /// Split points for mapping continuous features into buckets. With n+1 splits, there
        /// are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
        /// except the last bucket, which also includes y. The splits should be of length >= 3
        /// and strictly increasing. Values outside the splits specified will be treated as
        /// errors.
        /// </param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetSplits(double[] value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
        }

        /// <summary>
        /// Split points for splitting multiple columns into buckets. To split a single column
        /// use SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time.
        /// </summary>
        /// <param name="value">
        /// The array of split points for mapping continuous features into buckets for multiple
        /// columns. For each input column, with n+1 splits, there are n buckets. A bucket
        /// defined by splits x,y holds values in the range [x,y) except the last bucket, which
        /// also includes y. The splits should be of length >= 3 and strictly increasing.
        /// Values outside the splits specified will be treated as errors.</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetSplitsArray(double[][] value)
        {
            DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject,
                "setSplitsArray",
                "wrapper for double[][] from csharp", value);

            return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray",
                doubleArrayArray.ReferenceValue));
        }

        /// <summary>
        /// Sets the column that the <see cref="Bucketizer"/> should read from and convert into
        /// buckets
        /// </summary>
        /// <param name="value">The name of the column to use as the source of the buckets</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetInputCol(string value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
        }

        /// <summary>
        /// Sets the columns that the <see cref="Bucketizer"/> should read from and convert
        /// into buckets.
        ///
        /// Each column is one set of buckets, so if you have two input columns you can have
        /// two sets of buckets and two output columns.
        /// </summary>
        /// <param name="value">List of input columns to use as sources for buckets</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetInputCols(IEnumerable<string> value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
        }

        /// <summary>
        /// The <see cref="Bucketizer"/> will create a new column in the DataFrame; this is
        /// the name of the new column.
        /// </summary>
        /// <param name="value">The name of the new column which contains the bucket ID</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetOutputCol(string value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
        }

        /// <summary>
        /// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
        /// </summary>
        /// <param name="value">List of column names which will contain the bucket ID</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetOutputCols(List<string> value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
        }

        /// <summary>
        /// Executes the <see cref="Bucketizer"/> and transforms the DataFrame to include the
        /// new column or columns with the bucketed data.
        /// </summary>
        /// <param name="source">The DataFrame to add the bucketed data to</param>
        /// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
        /// columns</returns>
        public DataFrame Transform(DataFrame source)
        {
            return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
        }

        /// <summary>
        /// The reference we get back from each call isn't usable unless we wrap it in a new
        /// dotnet <see cref="Bucketizer"/>
        /// </summary>
        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
        /// <see cref="Bucketizer"/></param>
        /// <returns><see cref="Bucketizer"/></returns>
        private static Bucketizer WrapAsBucketizer(object obj)
        {
            return new Bucketizer((JvmObjectReference)obj);
        }

        /// <summary>
        /// The UID that was used to create the <see cref="Bucketizer"/>. If no UID is passed
        /// in when creating the <see cref="Bucketizer"/> then a random UID is created when the
        /// <see cref="Bucketizer"/> is created.
        /// </summary>
        /// <returns>string UID identifying the <see cref="Bucketizer"/></returns>
        public string Uid()
        {
            return (string)_jvmObject.Invoke("uid");
        }

        /// <summary>
        /// How the <see cref="Bucketizer"/> should handle invalid data; choices are "skip",
        /// "error" or "keep"
        /// </summary>
        /// <returns>string showing the way Spark will handle invalid data</returns>
        public string GetHandleInvalid()
        {
            return (string)_jvmObject.Invoke("getHandleInvalid");
        }

        /// <summary>
        /// Tells the <see cref="Bucketizer"/> what to do with invalid data.
        ///
        /// Choices are "skip", "error" or "keep". Default is "error".
        /// </summary>
        /// <param name="value">"skip", "error" or "keep"</param>
        /// <returns><see cref="Bucketizer"/></returns>
        public Bucketizer SetHandleInvalid(string value)
        {
            return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value));
        }
    }
}
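A note on the WrapAsBucketizer pattern above: each setter call crosses the JVM bridge and comes back as a raw JvmObjectReference, which is re-wrapped in a fresh .NET Bucketizer. Assuming, as in Spark's Scala implementation, that the ML setters return the same underlying JVM object, every wrapper observes the same state. A small illustration under that assumption (names are hypothetical):

using System;
using Microsoft.Spark.ML.Feature;

Bucketizer first = new Bucketizer("my_uid");
Bucketizer second = first.SetHandleInvalid("keep"); // new .NET wrapper, same JVM Bucketizer

// Both wrappers point at the same JVM object, so the setting is visible through either.
Console.WriteLine(first.GetHandleInvalid());  // "keep"
Console.WriteLine(second.Uid());              // "my_uid"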
src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs (38 additions, 0 deletions)
using System;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Newtonsoft.Json;

namespace Microsoft.Spark.ML.Param
{
    /// <summary>
    /// Internal class used to help the `Bucketizer` pass a double[][] into the JVM.
    /// </summary>
    class DoubleArrayArrayParam : IJvmObjectReferenceProvider
    {
        private readonly JvmObjectReference _jvmObject;

        public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param)
        {
            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
                "org.apache.spark.ml.param.DoubleArrayArrayParam",
                parent, name, doc);

            string json = JsonConvert.SerializeObject(param);
            ReferenceValue = JsonDecode(json);
        }

        private JvmObjectReference JsonDecode(string json)
        {
            return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json);
        }

        public JvmObjectReference Reference => _jvmObject;

        /// <summary>
        /// The JVM-side version of the double[][] so that it can be used by the `Bucketizer`.
        /// To get the double[][] across the SerDe, this serializes it as JSON and uses
        /// jsonDecode on the JVM side to turn it back into a double[][]. ReferenceValue is
        /// that double[][].
        /// </summary>
        public JvmObjectReference ReferenceValue { get; }
    }
}
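To make the SerDe path concrete: the payload that crosses the bridge is plain JSON. The snippet below is only a sketch of what JsonConvert.SerializeObject produces for a jagged array (the split values are illustrative); the jsonDecode call on the JVM side is what turns the string back into a double[][]:

using Newtonsoft.Json;

double[][] splits =
{
    new[] { 0.0, 10.0, 50.0 },
    new[] { 0.0, 10000.0 }
};

// Newtonsoft serializes a jagged array as nested JSON arrays:
// [[0.0,10.0,50.0],[0.0,10000.0]]
string json = JsonConvert.SerializeObject(splits);

// DoubleArrayArrayParam passes this string to the JVM, where
// org.apache.spark.ml.param.DoubleArrayArrayParam.jsonDecode(json)
// parses it back into an Array[Array[Double]] for setSplitsArray.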