
Support for Bucketizer #378


Merged
merged 27 commits into dotnet:master from GoEddie:bucketizer-ml-313 on Jan 17, 2020
Changes from 11 commits

Commits (27)
95d0014
implement bucketizer
Dec 29, 2019
fb2d019
first tests
Dec 29, 2019
d759e60
multi column tests
Dec 29, 2019
160fbf4
Merge branch 'master' into bucketizer-ml-313
GoEddie Jan 7, 2020
97ef668
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
4543974
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fd18cf4
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
64551c9
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fb70f40
tidying
Jan 8, 2020
119f14d
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 8, 2020
9891847
changes after review
Jan 8, 2020
d466ea2
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 13, 2020
633a843
SerDe to handle double[][] for Bucketizer
Jan 13, 2020
f4ecbb0
remove DoubleArrayArrayParam
Jan 13, 2020
b3d4d0f
SerDe for double[][]
Jan 13, 2020
500e7ad
spacing as per other fields
Jan 13, 2020
298f4ec
formatting
Jan 13, 2020
72d36fd
adding getters to tests
Jan 13, 2020
696186c
rollback
Jan 13, 2020
33699ea
Apply suggestions from code review
GoEddie Jan 15, 2020
5b80606
Fixing comments after review
Jan 15, 2020
e771f86
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 15, 2020
6c12e6a
fixes after review
Jan 15, 2020
3a9b793
Update src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs
GoEddie Jan 15, 2020
041131f
post-review tidy
Jan 17, 2020
9436877
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 17, 2020
1968f59
var
Jan 17, 2020
67 changes: 67 additions & 0 deletions src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
@@ -0,0 +1,67 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class BucketizerTests
{
private readonly SparkSession _spark;

public BucketizerTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestBucketizer()
{
Bucketizer bucketizer = new Bucketizer("uid")
.SetInputCol("input_col")
.SetOutputCol("output_col")
.SetHandleInvalid("skip")
.SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue});

Assert.Equal("skip", bucketizer.GetHandleInvalid());

Assert.Equal("uid", bucketizer.Uid());

DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

DataFrame output = bucketizer.Transform(input);
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col");
}

[Fact]
public void TestBucketizer_MultipleColumns()
{
Bucketizer bucketizer = new Bucketizer()
.SetInputCols(new List<string>() {"input_col_a", "input_col_b"})
.SetOutputCols(new List<string>() {"output_col_a", "output_col_b"})
.SetHandleInvalid("keep")
.SetSplitsArray(new[]
{
new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue},
new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue}
});

Assert.Equal("keep", bucketizer.GetHandleInvalid());

DataFrame input =
_spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)");

DataFrame output = bucketizer.Transform(input);
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_a");
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_b");
}
}
}
194 changes: 194 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -0,0 +1,194 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.ML.Param;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
/// <summary>
/// <see cref="Bucketizer"/> maps a column of continuous features to a column of feature
/// buckets.
///
/// <see cref="Bucketizer"/> can map multiple columns at once by setting the inputCols
/// parameter. Note that when both the inputCol and inputCols parameters are set, an Exception
/// will be thrown. The splits parameter is only used for single column usage, and splitsArray
/// is for multiple columns.
/// </summary>
public class Bucketizer : IJvmObjectReferenceProvider
{
internal Bucketizer(JvmObjectReference jvmObject)
{
_jvmObject = jvmObject;
}

private readonly JvmObjectReference _jvmObject;
JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

/// <summary>
/// Create a <see cref="Bucketizer"/> without any parameters.
/// </summary>
public Bucketizer()
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.feature.Bucketizer");
}

/// <summary>
/// Create a <see cref="Bucketizer"/> with a UID that is used to give the
/// <see cref="Bucketizer"/> a unique ID.
/// </summary>
/// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
public Bucketizer(string uid)
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.feature.Bucketizer", uid);
}

/// <summary>
/// Split points for splitting a single column into buckets. To split multiple columns use
/// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time.
/// </summary>
/// <param name="value">
/// Split points for mapping continuous features into buckets. With n+1 splits, there are n
/// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last
/// bucket, which also includes y. The splits should be of length &gt;= 3 and strictly
/// increasing. Values outside the splits specified will be treated as errors.
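/// For example, splits of {0.0, 10.0, 50.0} define two buckets: [0.0, 10.0) and
/// [10.0, 50.0], with the last bucket including its upper bound.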
/// </param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetSplits(double[] value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
}

/// <summary>
/// Split points for splitting multiple columns into buckets. To split a single column use
/// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time.
/// </summary>
/// <param name="value">
/// The array of split points for mapping continuous features into buckets for multiple
/// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined
/// by splits x,y holds values in the range [x,y) except the last bucket, which also
/// includes y. The splits should be of length &gt;= 3 and strictly increasing.
/// Values outside the splits specified will be treated as errors.</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetSplitsArray(double[][] value)
{
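// The SerDe cannot pass a double[][] directly, so wrap the value in a JVM
// DoubleArrayArrayParam, which round-trips the array as JSON (see
// DoubleArrayArrayParam below), and pass the decoded reference instead.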
DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject,
"setSplitsArray",
"wrapper for double[][] from csharp", value);

return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray",
doubleArrayArray.ReferenceValue));
}

/// <summary>
/// Sets the column that the <see cref="Bucketizer"/> should read from and convert into
/// buckets.
/// </summary>
/// <param name="value">The name of the column to use as the source of the buckets</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetInputCol(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
}

/// <summary>
/// Sets the columns that <see cref="Bucketizer"/> should read from and convert into
/// buckets.
///
/// Each column maps to one set of buckets, so if you have two input columns you can have
/// two sets of buckets and two output columns.
/// </summary>
/// <param name="value">List of input columns to use as sources for buckets</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetInputCols(IEnumerable<string> value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
}

/// <summary>
/// The <see cref="Bucketizer"/> will create a new column in the DataFrame; this is the
/// name of the new column.
/// </summary>
/// <param name="value">The name of the new column which contains the bucket ID</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetOutputCol(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
}

/// <summary>
/// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
/// </summary>
/// <param name="value">List of column names which will contain the bucket ID</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetOutputCols(List<string> value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
}

/// <summary>
/// Executes the <see cref="Bucketizer"/> and transforms the DataFrame to include the new
/// column or columns with the bucketed data.
/// </summary>
/// <param name="source">The DataFrame to add the bucketed data to</param>
/// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
/// columns</returns>
public DataFrame Transform(DataFrame source)
{
return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
}

/// <summary>
/// The reference we get back from each call isn't usable unless we wrap it in a new
/// dotnet <see cref="Bucketizer"/>.
/// </summary>
/// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
/// <see cref="Bucketizer"/></param>
/// <returns><see cref="Bucketizer"/></returns>
private static Bucketizer WrapAsBucketizer(object obj)
{
return new Bucketizer((JvmObjectReference)obj);
}

/// <summary>
/// The UID that was used to create the <see cref="Bucketizer"/>. If no UID is passed in
/// when creating the <see cref="Bucketizer"/> then a random UID is generated.
/// </summary>
/// <returns>string UID identifying the <see cref="Bucketizer"/></returns>
public string Uid()
{
return (string)_jvmObject.Invoke("uid");
}

/// <summary>
/// Gets how the <see cref="Bucketizer"/> handles invalid data; choices are "skip",
/// "error" or "keep".
/// </summary>
/// <returns>string showing the way Spark will handle invalid data</returns>
public string GetHandleInvalid()
{
return (string)_jvmObject.Invoke("getHandleInvalid");
}

/// <summary>
/// Tells the <see cref="Bucketizer"/> what to do with invalid data.
///
/// Choices are "skip", "error" or "keep". Default is "error".
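/// "skip" filters out the rows with invalid values, "error" throws an error and "keep"
/// puts the invalid values into a special extra bucket.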
/// </summary>
/// <param name="value">"skip", "error" or "keep"</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetHandleInvalid(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value));
}
}
}
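
As a usage sketch (not part of this diff; the class name, app name, column names and query are illustrative), the single-column API above can be wired up end to end like this:

// Hypothetical end-to-end sketch of the single-column Bucketizer API.
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

internal static class BucketizerSketch
{
    private static void Main()
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("bucketizer-sketch")
            .GetOrCreate();

        // Four buckets: [double.MinValue, 0), [0, 10), [10, 50) and
        // [50, double.MaxValue]; the last bucket includes its upper bound.
        Bucketizer bucketizer = new Bucketizer()
            .SetInputCol("value")
            .SetOutputCol("bucket")
            .SetHandleInvalid("error")
            .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue });

        DataFrame input = spark.Sql("SELECT ID as value FROM range(100)");

        // Adds a "bucket" column holding the bucket index for each row.
        bucketizer.Transform(input).Show();
    }
}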
38 changes: 38 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs
@@ -0,0 +1,38 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Newtonsoft.Json;

namespace Microsoft.Spark.ML.Param
{
/// <summary>
/// Internal class used to help the `Bucketizer` pass a double[][] into the JVM.
/// </summary>
class DoubleArrayArrayParam : IJvmObjectReferenceProvider
{
private readonly JvmObjectReference _jvmObject;

public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param)
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.param.DoubleArrayArrayParam",
parent, name, doc);

string json = JsonConvert.SerializeObject(param);
ReferenceValue = JsonDecode(json);
}

private JvmObjectReference JsonDecode(string json)
{
return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json);
}

public JvmObjectReference Reference => _jvmObject;

/// <summary>
/// The JVM version of the double[][], usable by the `Bucketizer`. To get the double[][]
/// across the SerDe, the value is serialized as JSON and jsonDecode is called on the JVM
/// side to turn it back into a double[][]; ReferenceValue is that decoded object.
/// </summary>
public JvmObjectReference ReferenceValue { get; }
}
}
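
For illustration, a minimal sketch of the serialization step this class performs (the class and helper names are hypothetical; the JSON shape is what Newtonsoft.Json produces by default):

using Newtonsoft.Json;

internal static class DoubleArrayArrayJsonSketch
{
    // Hypothetical helper mirroring the constructor above: serialize the
    // double[][] to JSON, e.g. {{0.0, 10.0}, {0.0, 10000.0}} becomes
    // "[[0.0,10.0],[0.0,10000.0]]". On the JVM side,
    // org.apache.spark.ml.param.DoubleArrayArrayParam.jsonDecode parses this
    // back into an Array[Array[Double]].
    internal static string Encode(double[][] splitsArray) =>
        JsonConvert.SerializeObject(splitsArray);
}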