
Commit 6064831

Adding support for Pivot with values (#642)

1 parent: c80b99f

File tree

6 files changed (+71 -0 lines)


src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs

Lines changed: 5 additions & 0 deletions
@@ -680,10 +680,15 @@ public void TestSignaturesV2_4_X()
 
             {
                 RelationalGroupedDataset df = _df.GroupBy("name");
+                var values = new List<object> { 19, "twenty" };
 
                 Assert.IsType<RelationalGroupedDataset>(df.Pivot("age"));
 
                 Assert.IsType<RelationalGroupedDataset>(df.Pivot(Col("age")));
+
+                Assert.IsType<RelationalGroupedDataset>(df.Pivot("age", values));
+
+                Assert.IsType<RelationalGroupedDataset>(df.Pivot(Col("age"), values));
             }
         }
     }
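
For context, here is a minimal usage sketch of the overloads these asserts exercise, assuming a DataFrame `df` with "name" and "age" columns as in the E2E test data; `Count()` is just a stand-in aggregation:

    using System.Collections.Generic;
    using Microsoft.Spark.Sql;

    // Minimal sketch, not part of the commit. Only the pivot values listed in
    // `values` become output columns; each cell holds the row count for that
    // (name, age) pair. Rows whose age matches none of the values are dropped.
    var values = new List<object> { 19, "twenty" };

    DataFrame pivoted = df
        .GroupBy("name")
        .Pivot("age", values)
        .Count();

    pivoted.Show();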

src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs

Lines changed: 26 additions & 0 deletions
@@ -31,6 +31,7 @@ internal class PayloadHelper
         private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' };
         private static readonly byte[] s_dictionaryTypeId = new[] { (byte)'e' };
         private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' };
+        private static readonly byte[] s_objectArrTypeId = new[] { (byte)'O' };
 
         private static readonly ConcurrentDictionary<Type, bool> s_isDictionaryTable =
             new ConcurrentDictionary<Type, bool>();

@@ -218,6 +219,26 @@ internal static void ConvertArgsToBytes(
                         destination.Position = posAfterEnumerable;
                         break;
 
+                    case IEnumerable<object> argObjectEnumerable:
+                        posBeforeEnumerable = destination.Position;
+                        destination.Position += sizeof(int);
+                        itemCount = 0;
+                        if (convertArgs == null)
+                        {
+                            convertArgs = new object[1];
+                        }
+                        foreach (object o in argObjectEnumerable)
+                        {
+                            ++itemCount;
+                            convertArgs[0] = o;
+                            ConvertArgsToBytes(destination, convertArgs, true);
+                        }
+                        posAfterEnumerable = destination.Position;
+                        destination.Position = posBeforeEnumerable;
+                        SerDe.Write(destination, itemCount);
+                        destination.Position = posAfterEnumerable;
+                        break;
+
                     case var _ when IsDictionary(arg.GetType()):
                         // Generic dictionary, but we don't have it strongly typed as
                         // Dictionary<T,U>

@@ -333,6 +354,11 @@ internal static byte[] GetTypeId(Type type)
                 return s_rowArrTypeId;
             }
 
+            if (typeof(IEnumerable<object>).IsAssignableFrom(type))
+            {
+                return s_objectArrTypeId;
+            }
+
            if (typeof(Date).IsAssignableFrom(type))
            {
                return s_dateTypeId;
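
A note on the new `IEnumerable<object>` case: it reserves four bytes, recursively serializes each element (each element writes its own type tag), then seeks back to patch in the count, the same back-patching pattern the existing Row-array case uses. A stripped-down, self-contained sketch of that idiom (hypothetical helper names, not the real PayloadHelper API):

    using System;
    using System.Collections.Generic;
    using System.IO;

    static class ObjectArraySketch
    {
        // Writes [int32 count][element #1]...[element #count], back-patching
        // the count once the elements are known. The JVM side reads the count
        // with Java's DataInputStream, which is big-endian, so it is written
        // big-endian here.
        public static void WriteObjectArray(
            Stream dest, IEnumerable<object> items, Action<Stream, object> writeElement)
        {
            long posBefore = dest.Position;
            dest.Position += sizeof(int);   // reserve room for the count

            int count = 0;
            foreach (object o in items)
            {
                ++count;
                writeElement(dest, o);      // element is self-describing (type tag + payload)
            }

            long posAfter = dest.Position;
            dest.Position = posBefore;      // seek back and patch the count
            dest.WriteByte((byte)(count >> 24));
            dest.WriteByte((byte)(count >> 16));
            dest.WriteByte((byte)(count >> 8));
            dest.WriteByte((byte)count);
            dest.Position = posAfter;
        }
    }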

src/csharp/Microsoft.Spark/Sql/RelationalGroupedDataset.cs

Lines changed: 22 additions & 0 deletions
@@ -94,6 +94,17 @@ public RelationalGroupedDataset Pivot(string pivotColumn) =>
             new RelationalGroupedDataset(
                 (JvmObjectReference)_jvmObject.Invoke("pivot", pivotColumn), _dataFrame);
 
+        /// <summary>
+        /// Pivots a column of the current DataFrame and performs the specified aggregation.
+        /// </summary>
+        /// <param name="pivotColumn">Name of the column to pivot of type string</param>
+        /// <param name="values">List of values that will be translated to columns in the
+        /// output DataFrame.</param>
+        /// <returns>New RelationalGroupedDataset object with pivot applied</returns>
+        public RelationalGroupedDataset Pivot(string pivotColumn, IEnumerable<object> values) =>
+            new RelationalGroupedDataset(
+                (JvmObjectReference)_jvmObject.Invoke("pivot", pivotColumn, values), _dataFrame);
+
         /// <summary>
         /// Pivots a column of the current DataFrame and performs the specified aggregation.
         /// </summary>

@@ -103,6 +114,17 @@ public RelationalGroupedDataset Pivot(Column pivotColumn) =>
             new RelationalGroupedDataset(
                 (JvmObjectReference)_jvmObject.Invoke("pivot", pivotColumn), _dataFrame);
 
+        /// <summary>
+        /// Pivots a column of the current DataFrame and performs the specified aggregation.
+        /// </summary>
+        /// <param name="pivotColumn">The column to pivot of type <see cref="Column"/></param>
+        /// <param name="values">List of values that will be translated to columns in the
+        /// output DataFrame.</param>
+        /// <returns>New RelationalGroupedDataset object with pivot applied</returns>
+        public RelationalGroupedDataset Pivot(Column pivotColumn, IEnumerable<object> values) =>
+            new RelationalGroupedDataset(
+                (JvmObjectReference)_jvmObject.Invoke("pivot", pivotColumn, values), _dataFrame);
+
         internal DataFrame Apply(StructType returnType, Func<FxDataFrame, FxDataFrame> func)
         {
             DataFrameGroupedMapWorkerFunction.ExecuteDelegate wrapper =
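
Design note: as with Spark's Scala `RelationalGroupedDataset.pivot(pivotColumn, values)`, supplying the values up front lets Spark skip the extra pass it would otherwise run to collect the distinct values of the pivot column, and it fixes the output schema even when some listed values are absent from the data. A quick sketch (the "salary" column and `Sum` aggregation are illustrative, not from this commit):

    // Explicit pivot values: no distinct-value scan, stable output schema.
    DataFrame result = df
        .GroupBy("name")
        .Pivot(Col("age"), new List<object> { 19, "twenty" })
        .Agg(Sum(Col("salary")));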

src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,7 @@ object SerDe {
       case 't' => readTime(dis)
       case 'j' => JVMObjectTracker.getObject(readString(dis))
       case 'R' => readRowArr(dis)
+      case 'O' => readObjectArr(dis)
       case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
     }
   }

@@ -138,6 +139,11 @@ object SerDe {
     (0 until len).map(_ => readRow(in)).toList.asJava
   }
 
+  def readObjectArr(in: DataInputStream): Seq[Any] = {
+    val len = readInt(in)
+    (0 until len).map(_ => readObject(in))
+  }
+
   def readList(dis: DataInputStream): Array[_] = {
     val arrType = readObjectType(dis)
     arrType match {
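
On the wire, an 'O' payload is the big-endian int32 count written by PayloadHelper followed by that many self-describing objects, which is exactly what readObjectArr consumes; this commit only adds the C#-to-JVM direction. The same readObjectArr is added verbatim to the 2.4.x and 3.0.x SerDe files below. Purely for illustration, a C# mirror of the reader (hypothetical `readObject` callback; same usings as the sketch above):

    // Illustrative C# equivalent of readObjectArr; not part of the commit.
    // Wire format: [int32 count, big-endian][object #1]...[object #count]
    static object[] ReadObjectArray(Stream input, Func<Stream, object> readObject)
    {
        int len = ReadInt32BigEndian(input);    // matches Java's readInt
        var result = new object[len];
        for (int i = 0; i < len; ++i)
        {
            result[i] = readObject(input);      // each object carries its own type tag
        }
        return result;
    }

    static int ReadInt32BigEndian(Stream s)
    {
        int b0 = s.ReadByte(), b1 = s.ReadByte(), b2 = s.ReadByte(), b3 = s.ReadByte();
        if (b3 < 0)
        {
            throw new EndOfStreamException();
        }
        return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
    }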

src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,7 @@ object SerDe {
       case 't' => readTime(dis)
       case 'j' => JVMObjectTracker.getObject(readString(dis))
       case 'R' => readRowArr(dis)
+      case 'O' => readObjectArr(dis)
       case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
     }
   }

@@ -138,6 +139,11 @@ object SerDe {
     (0 until len).map(_ => readRow(in)).toList.asJava
   }
 
+  def readObjectArr(in: DataInputStream): Seq[Any] = {
+    val len = readInt(in)
+    (0 until len).map(_ => readObject(in))
+  }
+
   def readList(dis: DataInputStream): Array[_] = {
     val arrType = readObjectType(dis)
     arrType match {

src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,7 @@ object SerDe {
       case 't' => readTime(dis)
       case 'j' => JVMObjectTracker.getObject(readString(dis))
       case 'R' => readRowArr(dis)
+      case 'O' => readObjectArr(dis)
       case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
     }
   }

@@ -138,6 +139,11 @@ object SerDe {
     (0 until len).map(_ => readRow(in)).toList.asJava
   }
 
+  def readObjectArr(in: DataInputStream): Seq[Any] = {
+    val len = readInt(in)
+    (0 until len).map(_ => readObject(in))
+  }
+
   def readList(dis: DataInputStream): Array[_] = {
     val arrType = readObjectType(dis)
     arrType match {
