apache · mgaido91 · Sep 9, 2019 · Sep 9, 2019 · Sep 11, 2019 · Sep 12, 2019
diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
@@ -7,6 +7,12 @@ displayTitle: Spark SQL Upgrading Guide
 * Table of contents
 {:toc}
 
+## Upgrading from Spark SQL 2.4 to 2.4.5
+
+ - Starting from 2.4.5, SQL configurations are also effective when a Dataset is converted to an RDD and its
+   plan is executed due to an action on the derived RDD. The previous buggy behavior can be restored by
+   setting `spark.sql.legacy.rdd.applyConf` to `false`.
+
 ## Upgrading from Spark SQL 2.4 to 2.4.1
 
   - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1298,6 +1298,14 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val USE_CONF_ON_RDD_OPERATION =
+    buildConf("spark.sql.legacy.rdd.applyConf")
+      .internal()
+      .doc("When false, SQL configurations are disregarded when operations on a RDD derived from" +
+        " a dataframe are executed. This is the (buggy) behavior up to 2.4.4.")
+      .booleanConf
+      .createWithDefault(true)
+
   val REPLACE_EXCEPT_WITH_FILTER = buildConf("spark.sql.optimizer.replaceExceptWithFilter")
     .internal()
     .doc("When true, the apply function of the rule verifies whether the right node of the" +

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.command.{DescribeTableCommand, ExecutedCommandExec, ShowTablesCommand}
 import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{BinaryType, DateType, DecimalType, TimestampType, _}
 import org.apache.spark.util.Utils
 
@@ -77,8 +78,13 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
   lazy val executedPlan: SparkPlan = prepareForExecution(sparkPlan)
 
   /** Internal version of the RDD. Avoids copies and has no schema */
-  lazy val toRdd: RDD[InternalRow] = new SQLExecutionRDD(
-    executedPlan.execute(), sparkSession.sessionState.conf)
+  lazy val toRdd: RDD[InternalRow] = {
+    if (sparkSession.sessionState.conf.getConf(SQLConf.USE_CONF_ON_RDD_OPERATION)) {
+      new SQLExecutionRDD(executedPlan.execute(), sparkSession.sessionState.conf)
+    } else {
+      executedPlan.execute() // legacy path: plain RDD, not wrapped with the session SQLConf
+    }
+  }
 
   /**
    * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecutionRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecutionRDD.scala
@@ -16,10 +16,6 @@
  */
 package org.apache.spark.sql.execution
 
-import java.util.Properties
-
-import scala.collection.JavaConverters._
-
 import org.apache.spark.{Partition, TaskContext}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -38,10 +34,8 @@ class SQLExecutionRDD(
     var sqlRDD: RDD[InternalRow], conf: SQLConf) extends RDD[InternalRow](sqlRDD) {
   private val sqlConfigs = conf.getAllConfs
   private lazy val sqlConfExecutorSide = {
-    val props = new Properties()
-    props.putAll(sqlConfigs.asJava)
     val newConf = new SQLConf()
-    newConf.setConf(props)
+    sqlConfigs.foreach { case (k, v) => newConf.setConfString(k, v) } // copy each captured entry
     newConf
   }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala
@@ -105,17 +105,29 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils {
   }
 
   test("SPARK-28939: propagate SQLConf also in conversions to RDD") {
-    val confs = Seq("spark.sql.a" -> "x", "spark.sql.b" -> "y")
-    val physicalPlan = SQLConfAssertPlan(confs)
-    val dummyQueryExecution = FakeQueryExecution(spark, physicalPlan)
-    withSQLConf(confs: _*) {
-      // Force RDD evaluation to trigger asserts
-      dummyQueryExecution.toRdd.collect()
+    withSQLConf(SQLConf.USE_CONF_ON_RDD_OPERATION.key -> "true") {
+      val confs = Seq("spark.sql.a" -> "x", "spark.sql.b" -> "y")
+      val physicalPlan = SQLConfAssertPlan(confs)
+      val dummyQueryExecution = FakeQueryExecution(spark, physicalPlan)
+      withSQLConf(confs: _*) {
+        // Force RDD evaluation to trigger asserts
+        dummyQueryExecution.toRdd.collect()
+      }
+      val dummyQueryExecution1 = FakeQueryExecution(spark, physicalPlan)
+      // Without setting the configs, the assertions fail
+      val e = intercept[SparkException](dummyQueryExecution1.toRdd.collect())
+      assert(e.getCause.isInstanceOf[NoSuchElementException])
+    }
+    withSQLConf(SQLConf.USE_CONF_ON_RDD_OPERATION.key -> "false") {
+      val confs = Seq("spark.sql.a" -> "x", "spark.sql.b" -> "y")
+      val physicalPlan = SQLConfAssertPlan(confs)
+      val dummyQueryExecution = FakeQueryExecution(spark, physicalPlan)
+      withSQLConf(confs: _*) {
+        // Flag is false: evaluation is expected to fail even though the confs are set
+        val e = intercept[SparkException](dummyQueryExecution.toRdd.collect())
+        assert(e.getCause.isInstanceOf[NoSuchElementException])
+      }
     }
-    val dummyQueryExecution1 = FakeQueryExecution(spark, physicalPlan)
-    // Without setting the configs assertions fail
-    val e = intercept[SparkException](dummyQueryExecution1.toRdd.collect())
-    assert(e.getCause.isInstanceOf[NoSuchElementException])
   }
 }