
Support for Bucketizer #378


Merged
merged 27 commits into dotnet:master from GoEddie:bucketizer-ml-313 on Jan 17, 2020
Changes from 11 commits

Commits (27)
95d0014
implement bucketizer
Dec 29, 2019
fb2d019
first tests
Dec 29, 2019
d759e60
multi column tests
Dec 29, 2019
160fbf4
Merge branch 'master' into bucketizer-ml-313
GoEddie Jan 7, 2020
97ef668
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
4543974
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fd18cf4
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
64551c9
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fb70f40
tidying
Jan 8, 2020
119f14d
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 8, 2020
9891847
changes after review
Jan 8, 2020
d466ea2
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 13, 2020
633a843
SerDe to handle double[][] for Bucketizer
Jan 13, 2020
f4ecbb0
remove DoubleArrayArrayParam
Jan 13, 2020
b3d4d0f
SerDe for double[][]
Jan 13, 2020
500e7ad
spacing as per other fields
Jan 13, 2020
298f4ec
formatting
Jan 13, 2020
72d36fd
adding getters to tests
Jan 13, 2020
696186c
rollback
Jan 13, 2020
33699ea
Apply suggestions from code review
GoEddie Jan 15, 2020
5b80606
Fixing comments after review
Jan 15, 2020
e771f86
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 15, 2020
6c12e6a
fixes after review
Jan 15, 2020
3a9b793
Update src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs
GoEddie Jan 15, 2020
041131f
post-review tidy
Jan 17, 2020
9436877
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 17, 2020
1968f59
var
Jan 17, 2020
67 changes: 67 additions & 0 deletions src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
@@ -0,0 +1,67 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class BucketizerTests
{
private readonly SparkSession _spark;

public BucketizerTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestBucketizer()
{
Bucketizer bucketizer = new Bucketizer("uid")
.SetInputCol("input_col")
.SetOutputCol("output_col")
.SetHandleInvalid("skip")
.SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue});

Assert.Equal("skip", bucketizer.GetHandleInvalid());

Assert.Equal("uid", bucketizer.Uid());

DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

DataFrame output = bucketizer.Transform(input);
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col");
}

[Fact]
public void TestBucketizer_MultipleColumns()
{
Bucketizer bucketizer = new Bucketizer()
.SetInputCols(new List<string>() {"input_col_a", "input_col_b"})
.SetOutputCols(new List<string>() {"output_col_a", "output_col_b"})
.SetHandleInvalid("keep")
.SetSplitsArray(new[]
{
new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue},
new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue}
});

Assert.Equal("keep", bucketizer.GetHandleInvalid());

DataFrame input =
_spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)");

DataFrame output = bucketizer.Transform(input);
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_a");
Assert.Contains(output.Schema().Fields, f => f.Name == "output_col_b");
}
}
}
194 changes: 194 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -0,0 +1,194 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.ML.Param;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
/// <summary>
/// <see cref="Bucketizer"/> maps a column of continuous features to a column of feature
/// buckets.
///
/// <see cref="Bucketizer"/> can map multiple columns at once by setting the inputCols
/// parameter. Note that when both the inputCol and inputCols parameters are set, an Exception
/// will be thrown. The splits parameter is only used for single column usage, and splitsArray
/// is for multiple columns.
/// </summary>
public class Bucketizer : IJvmObjectReferenceProvider
{
internal Bucketizer(JvmObjectReference jvmObject)
{
_jvmObject = jvmObject;
}

private readonly JvmObjectReference _jvmObject;
JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

/// <summary>
/// Create a <see cref="Bucketizer"/> without any parameters.
/// </summary>
public Bucketizer()
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.feature.Bucketizer");
}

/// <summary>
/// Create a <see cref="Bucketizer"/> with a UID that is used to give the
/// <see cref="Bucketizer"/> a unique ID.
/// </summary>
/// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
public Bucketizer(string uid)
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.feature.Bucketizer", uid);
}

/// <summary>
/// Split points for splitting a single column into buckets. To split multiple columns use
/// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time.
/// </summary>
/// <param name="value">
/// Split points for mapping continuous features into buckets. With n+1 splits, there are n
/// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last
/// bucket, which also includes y. The splits should be of length &gt;= 3 and strictly
/// increasing. Values outside the splits specified will be treated as errors.
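/// For example, splits of {0.0, 10.0, 50.0} define two buckets: [0.0, 10.0) and
/// [10.0, 50.0], with the last bucket including its upper bound.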
/// </param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetSplits(double[] value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
}

/// <summary>
/// Split points for splitting multiple columns into buckets. To split a single column use
/// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time.
/// </summary>
/// <param name="value">
/// The array of split points for mapping continuous features into buckets for multiple
/// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined
/// by splits x,y holds values in the range [x,y) except the last bucket, which also
/// includes y. The splits should be of length &gt;= 3 and strictly increasing.
/// Values outside the splits specified will be treated as errors.</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetSplitsArray(double[][] value)
{
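// The SerDe cannot pass a double[][] directly, so wrap the value in a JVM
// DoubleArrayArrayParam, which round-trips the array as JSON (see
// DoubleArrayArrayParam below), and pass the decoded reference instead.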
DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject,
"setSplitsArray",
"wrapper for double[][] from csharp", value);

return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray",
doubleArrayArray.ReferenceValue));
}

/// <summary>
/// Sets the column that the <see cref="Bucketizer"/> should read from and convert into
/// buckets.
/// </summary>
/// <param name="value">The name of the column to use as the source of the buckets</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetInputCol(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
}

/// <summary>
/// Sets the columns that <see cref="Bucketizer"/> should read from and convert into
/// buckets.
///
/// Each column maps to one set of buckets, so if you have two input columns you can have
/// two sets of buckets and two output columns.
/// </summary>
/// <param name="value">List of input columns to use as sources for buckets</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetInputCols(IEnumerable<string> value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
}

/// <summary>
/// The <see cref="Bucketizer"/> will create a new column in the DataFrame; this is the
/// name of the new column.
/// </summary>
/// <param name="value">The name of the new column which contains the bucket ID</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetOutputCol(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
}

/// <summary>
/// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
/// </summary>
/// <param name="value">List of column names which will contain the bucket ID</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetOutputCols(List<string> value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
}

/// <summary>
/// Executes the <see cref="Bucketizer"/> and transforms the DataFrame to include the new
/// column or columns with the bucketed data.
/// </summary>
/// <param name="source">The DataFrame to add the bucketed data to</param>
/// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
/// columns</returns>
public DataFrame Transform(DataFrame source)
{
return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
}

/// <summary>
/// The reference we get back from each call isn't usable unless we wrap it in a new
/// dotnet <see cref="Bucketizer"/>.
/// </summary>
/// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
/// <see cref="Bucketizer"/></param>
/// <returns><see cref="Bucketizer"/></returns>
private static Bucketizer WrapAsBucketizer(object obj)
{
return new Bucketizer((JvmObjectReference)obj);
}

/// <summary>
/// The UID that was used to create the <see cref="Bucketizer"/>. If no UID is passed in
/// when creating the <see cref="Bucketizer"/> then a random UID is generated.
/// </summary>
/// <returns>string UID identifying the <see cref="Bucketizer"/></returns>
public string Uid()
{
return (string)_jvmObject.Invoke("uid");
}

/// <summary>
/// Gets how the <see cref="Bucketizer"/> handles invalid data; choices are "skip",
/// "error" or "keep".
/// </summary>
/// <returns>string showing the way Spark will handle invalid data</returns>
public string GetHandleInvalid()
{
return (string)_jvmObject.Invoke("getHandleInvalid");
}

/// <summary>
/// Tells the <see cref="Bucketizer"/> what to do with invalid data.
///
/// Choices are "skip", "error" or "keep". Default is "error".
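/// "skip" filters out the rows with invalid values, "error" throws an error and "keep"
/// puts the invalid values into a special extra bucket.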
/// </summary>
/// <param name="value">"skip", "error" or "keep"</param>
/// <returns><see cref="Bucketizer"/></returns>
public Bucketizer SetHandleInvalid(string value)
{
return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value));
}
}
}
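
As a usage sketch (not part of this diff; the class name, app name, column names and query are illustrative), the single-column API above can be wired up end to end like this:

// Hypothetical end-to-end sketch of the single-column Bucketizer API.
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

internal static class BucketizerSketch
{
    private static void Main()
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("bucketizer-sketch")
            .GetOrCreate();

        // Four buckets: [double.MinValue, 0), [0, 10), [10, 50) and
        // [50, double.MaxValue]; the last bucket includes its upper bound.
        Bucketizer bucketizer = new Bucketizer()
            .SetInputCol("value")
            .SetOutputCol("bucket")
            .SetHandleInvalid("error")
            .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue });

        DataFrame input = spark.Sql("SELECT ID as value FROM range(100)");

        // Adds a "bucket" column holding the bucket index for each row.
        bucketizer.Transform(input).Show();
    }
}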
38 changes: 38 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs
@@ -0,0 +1,38 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Newtonsoft.Json;

namespace Microsoft.Spark.ML.Param
{
/// <summary>
/// Internal class used to help the `Bucketizer` pass a double[][] into the JVM.
/// </summary>
class DoubleArrayArrayParam : IJvmObjectReferenceProvider
{
private readonly JvmObjectReference _jvmObject;

public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param)
{
_jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
"org.apache.spark.ml.param.DoubleArrayArrayParam",
parent, name, doc);

string json = JsonConvert.SerializeObject(param);
ReferenceValue = JsonDecode(json);
}

private JvmObjectReference JsonDecode(string json)
{
return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json);
}

public JvmObjectReference Reference => _jvmObject;

/// <summary>
/// The JVM version of the double[][], usable by the `Bucketizer`. To get the double[][]
/// across the SerDe, the value is serialized as JSON and jsonDecode is called on the JVM
/// side to turn it back into a double[][]; ReferenceValue is that decoded object.
/// </summary>
public JvmObjectReference ReferenceValue { get; }
}
}
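
For illustration, a minimal sketch of the serialization step this class performs (the class and helper names are hypothetical; the JSON shape is what Newtonsoft.Json produces by default):

using Newtonsoft.Json;

internal static class DoubleArrayArrayJsonSketch
{
    // Hypothetical helper mirroring the constructor above: serialize the
    // double[][] to JSON, e.g. {{0.0, 10.0}, {0.0, 10000.0}} becomes
    // "[[0.0,10.0],[0.0,10000.0]]". On the JVM side,
    // org.apache.spark.ml.param.DoubleArrayArrayParam.jsonDecode parses this
    // back into an Array[Array[Double]].
    internal static string Encode(double[][] splitsArray) =>
        JsonConvert.SerializeObject(splitsArray);
}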