
Commit dcd2b1a

davidm-db authored and cloud-fan committed
[SPARK-52134] Move execution logic to SqlScriptingExecution and enable Spark Connect path
### What changes were proposed in this pull request?

#### Spark Connect execution

Move the script execution from `SparkSession#sql` to `QueryExecution#lazyAnalyzed`. This lets `QueryExecution` receive the original parsed logical plan for scripting, which is used to detect script execution in Spark Connect and treat it as a command.

#### executeSqlScript refactor

Move the `executeSqlScript` logic from `SparkSession` to the `SqlScriptingExecution` object.

### Why are the changes needed?

SQL Scripting improvements.

### Does this PR introduce _any_ user-facing change?

No. The PR does enable new functionality (execution through Spark Connect), but the results remain the same.

### How was this patch tested?

Existing tests confirm that the refactoring of the execution logic does not change behavior. A test was added to confirm that execution through Spark Connect does not fail.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#50895 from davidm-db/execute_sql_script_refactor.

Lead-authored-by: David Milicevic <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent f087cc4 commit dcd2b1a

6 files changed: +164 −96 lines
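Five of the six changed files are reproduced below; the hunk that adds `executeSqlScript` to the `SqlScriptingExecution` object in sql/core is not included in this excerpt. As a rough guide only, here is a sketch of what the relocated helper plausibly looks like, reconstructed from the method removed from `SparkSession` (shown further down) and the call sites added in `QueryExecution#lazyAnalyzed`. It is an assumption, not the committed code, and is meant to sit in the same file as the existing `class SqlScriptingExecution`:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{CompoundBody, LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.classic.SparkSession
import org.apache.spark.sql.types.StructType

// Sketch only: shape inferred from the removed SparkSession#executeSqlScript and the
// SqlScriptingExecution.executeSqlScript(...) calls added in QueryExecution.
object SqlScriptingExecution {

  /** Runs `script` statement by statement and returns the last result as a local relation. */
  def executeSqlScript(
      session: SparkSession,
      script: CompoundBody,
      args: Map[String, Expression] = Map.empty): LogicalPlan = {
    val sse = new SqlScriptingExecution(script, session, args)
    sse.withLocalVariableManager {
      var result: Option[Seq[Row]] = None
      var resultSchema: Option[StructType] = None

      // getNextResult advances the script, so each returned DataFrame must be collected
      // immediately to preserve the statement execution order.
      var df = sse.getNextResult
      while (df.isDefined) {
        sse.withErrorHandling {
          result = Some(df.get.collect().toSeq)
          resultSchema = Some(df.get.schema)
        }
        df = sse.getNextResult
      }

      result match {
        // The script produced no result rows: return an empty relation.
        case None => LocalRelation(Seq.empty[Attribute])
        // Otherwise wrap the collected rows in a plan that QueryExecution can analyze.
        case Some(rows) =>
          LocalRelation.fromExternalRows(DataTypeUtils.toAttributes(resultSchema.get), rows)
      }
    }
  }
}
```

Returning a `LogicalPlan` (rather than a `DataFrame`, as the removed `SparkSession` method did) matches the new call sites, which feed the result straight into the analyzer.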

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala

Lines changed: 4 additions & 3 deletions
@@ -54,7 +54,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils}
 import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, JoinType, LeftAnti, LeftOuter, LeftSemi, RightOuter, UsingJoin}
 import org.apache.spark.sql.catalyst.plans.logical
-import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, Assignment, CoGroup, CollectMetrics, CommandResult, Deduplicate, DeduplicateWithinWatermark, DeleteAction, DeserializeToObject, Except, FlatMapGroupsWithState, InsertAction, InsertStarAction, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, MergeAction, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TimeModes, TransformWithState, TypedFilter, Union, Unpivot, UnresolvedHint, UpdateAction, UpdateEventTimeWatermarkColumn, UpdateStarAction}
+import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, Assignment, CoGroup, CollectMetrics, CommandResult, CompoundBody, Deduplicate, DeduplicateWithinWatermark, DeleteAction, DeserializeToObject, Except, FlatMapGroupsWithState, InsertAction, InsertStarAction, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, MergeAction, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TimeModes, TransformWithState, TypedFilter, Union, Unpivot, UnresolvedHint, UpdateAction, UpdateEventTimeWatermarkColumn, UpdateStarAction}
 import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
 import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin, TreePattern}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
@@ -2795,8 +2795,9 @@ class SparkConnectPlanner(
          s"SQL command expects either a SQL or a WithRelations, but got $other")
    }

-   // Check if commands have been executed.
+   // Check if command or SQL Script has been executed.
    val isCommand = df.queryExecution.commandExecuted.isInstanceOf[CommandResult]
+   val isSqlScript = df.queryExecution.logical.isInstanceOf[CompoundBody]
    val rows = df.logicalPlan match {
      case lr: LocalRelation => lr.data
      case cr: CommandResult => cr.rows
@@ -2808,7 +2809,7 @@ class SparkConnectPlanner(
    val result = SqlCommandResult.newBuilder()
    // Only filled when isCommand
    val metrics = ExecutePlanResponse.Metrics.newBuilder()
-   if (isCommand) {
+   if (isCommand || isSqlScript) {
      // Convert the results to Arrow.
      val schema = df.schema
      val maxBatchSize = (SparkEnv.get.conf.get(CONNECT_GRPC_ARROW_MAX_BATCH_SIZE) * 0.7).toLong

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkConnectServerTest.scala

Lines changed: 57 additions & 0 deletions
@@ -16,16 +16,19 @@
  */
 package org.apache.spark.sql.connect

+import java.io.ByteArrayInputStream
 import java.util.{TimeZone, UUID}

 import scala.reflect.runtime.universe.TypeTag

 import org.apache.arrow.memory.RootAllocator
+import org.apache.arrow.vector.ipc.ArrowStreamReader
 import org.scalatest.concurrent.{Eventually, TimeLimits}
 import org.scalatest.time.Span
 import org.scalatest.time.SpanSugar._

 import org.apache.spark.connect.proto
+import org.apache.spark.connect.proto.ExecutePlanResponse
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.connect.client.{CloseableIterator, CustomSparkConnectBlockingStub, ExecutePlanResponseReattachableIterator, RetryPolicy, SparkConnectClient, SparkConnectStubState}
 import org.apache.spark.sql.connect.client.arrow.ArrowSerializer
@@ -143,6 +146,21 @@ trait SparkConnectServerTest extends SharedSparkSession {
     proto.Plan.newBuilder().setRoot(dsl.sql(query)).build()
   }

+  protected def buildSqlCommandPlan(sqlCommand: String) = {
+    proto.Plan
+      .newBuilder()
+      .setCommand(
+        proto.Command
+          .newBuilder()
+          .setSqlCommand(
+            proto.SqlCommand
+              .newBuilder()
+              .setSql(sqlCommand)
+              .build())
+          .build())
+      .build()
+  }
+
   protected def buildLocalRelation[A <: Product: TypeTag](data: Seq[A]) = {
     val encoder = ScalaReflection.encoderFor[A]
     val arrowData =
@@ -305,4 +323,43 @@ trait SparkConnectServerTest extends SharedSparkSession {
     val plan = buildPlan(query)
     runQuery(plan, queryTimeout, iterSleep)
   }
+
+  protected def checkSqlCommandResponse(
+      result: ExecutePlanResponse.SqlCommandResult,
+      expected: Seq[Seq[Any]]): Unit = {
+    // Extract the serialized Arrow data as a byte array.
+    val dataBytes = result.getRelation.getLocalRelation.getData.toByteArray
+
+    // Create an ArrowStreamReader to deserialize the data.
+    val allocator = new RootAllocator(Long.MaxValue)
+    val inputStream = new ByteArrayInputStream(dataBytes)
+    val reader = new ArrowStreamReader(inputStream, allocator)
+
+    try {
+      // Read the schema and data.
+      val root = reader.getVectorSchemaRoot
+      // Load the first batch of data.
+      reader.loadNextBatch()
+
+      // Get dimensions.
+      val rowCount = root.getRowCount
+      val colCount = root.getFieldVectors.size
+      assert(rowCount == expected.length, "Row count mismatch")
+      assert(colCount == expected.head.length, "Column count mismatch")
+
+      // Compare to expected.
+      for (i <- 0 until rowCount) {
+        for (j <- 0 until colCount) {
+          val col = root.getFieldVectors.get(j)
+          val value = col.getObject(i)
+          print(value)
+          assert(value == expected(i)(j), s"Value mismatch at ($i, $j)")
+        }
+      }
+    } finally {
+      // Clean up resources.
+      reader.close()
+      allocator.close()
+    }
+  }
 }

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectServiceE2ESuite.scala

Lines changed: 21 additions & 0 deletions
@@ -33,6 +33,27 @@ class SparkConnectServiceE2ESuite extends SparkConnectServerTest {
   // were all already in the buffer.
   val BIG_ENOUGH_QUERY = "select * from range(1000000)"

+  test("SQL Script over Spark Connect.") {
+    val sessionId = UUID.randomUUID.toString()
+    val userId = "ScriptUser"
+    val sqlScriptText =
+      """BEGIN
+        |IF 1 = 1 THEN
+        |  SELECT 1;
+        |ELSE
+        |  SELECT 2;
+        |END IF;
+        |END
+        """.stripMargin
+    withClient(sessionId = sessionId, userId = userId) { client =>
+      // this will create the session, and then ReleaseSession at the end of withClient.
+      val enableSqlScripting = client.execute(buildPlan("SET spark.sql.scripting.enabled=true"))
+      enableSqlScripting.hasNext // trigger execution
+      val query = client.execute(buildSqlCommandPlan(sqlScriptText))
+      checkSqlCommandResponse(query.next().getSqlCommandResult, Seq(Seq(1)))
+    }
+  }
+
   test("Execute is sent eagerly to the server upon iterator creation") {
     // This behavior changed with grpc upgrade from 1.56.0 to 1.59.0.
     // Testing to be aware of future changes.

sql/core/src/main/scala/org/apache/spark/sql/classic/SparkSession.scala

Lines changed: 16 additions & 88 deletions
@@ -42,10 +42,9 @@ import org.apache.spark.sql.artifact.ArtifactManager
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis.{NameParameterizedQuery, PosParameterizedQuery, UnresolvedRelation}
 import org.apache.spark.sql.catalyst.encoders._
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import org.apache.spark.sql.catalyst.parser.ParserInterface
-import org.apache.spark.sql.catalyst.plans.logical.{CompoundBody, LocalRelation, LogicalPlan, Range}
-import org.apache.spark.sql.catalyst.types.DataTypeUtils
+import org.apache.spark.sql.catalyst.plans.logical.{CompoundBody, LocalRelation, Range}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
 import org.apache.spark.sql.catalyst.util.CharVarcharUtils
 import org.apache.spark.sql.classic.SparkSession.applyAndLoadExtensions
@@ -56,7 +55,6 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.internal._
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
-import org.apache.spark.sql.scripting.SqlScriptingExecution
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.sql.util.ExecutionListenerManager
@@ -432,50 +430,6 @@ class SparkSession private(
    | Everything else |
    * ----------------- */

-  /**
-   * Executes given script and return the result of the last statement.
-   * If script contains no queries, an empty `DataFrame` is returned.
-   *
-   * @param script A SQL script to execute.
-   * @param args A map of parameter names to SQL literal expressions.
-   *
-   * @return The result as a `DataFrame`.
-   */
-  private def executeSqlScript(
-      script: CompoundBody,
-      args: Map[String, Expression] = Map.empty): DataFrame = {
-    val sse = new SqlScriptingExecution(script, this, args)
-    sse.withLocalVariableManager {
-      var result: Option[Seq[Row]] = None
-
-      // We must execute returned df before calling sse.getNextResult again because sse.hasNext
-      // advances the script execution and executes all statements until the next result. We must
-      // collect results immediately to maintain execution order.
-      // This ensures we respect the contract of SqlScriptingExecution API.
-      var df: Option[DataFrame] = sse.getNextResult
-      var resultSchema: Option[StructType] = None
-      while (df.isDefined) {
-        sse.withErrorHandling {
-          // Collect results from the current DataFrame.
-          result = Some(df.get.collect().toSeq)
-          resultSchema = Some(df.get.schema)
-        }
-        df = sse.getNextResult
-      }
-
-      if (result.isEmpty) {
-        emptyDataFrame
-      } else {
-        // If `result` is defined, then `resultSchema` must be defined as well.
-        assert(resultSchema.isDefined)
-
-        val attributes = DataTypeUtils.toAttributes(resultSchema.get)
-        Dataset.ofRows(
-          self, LocalRelation.fromExternalRows(attributes, result.get))
-      }
-    }
-  }
-
   /**
    * Executes a SQL query substituting positional parameters by the given arguments,
    * returning the result as a `DataFrame`.
@@ -495,30 +449,17 @@ class SparkSession private(
    withActive {
      val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
        val parsedPlan = sessionState.sqlParser.parsePlan(sqlText)
-       parsedPlan match {
-         case compoundBody: CompoundBody =>
-           if (args.nonEmpty) {
-             // Positional parameters are not supported for SQL scripting.
-             throw SqlScriptingErrors.positionalParametersAreNotSupportedWithSqlScripting()
-           }
-           compoundBody
-         case logicalPlan: LogicalPlan =>
-           if (args.nonEmpty) {
-             PosParameterizedQuery(logicalPlan, args.map(lit(_).expr).toImmutableArraySeq)
-           } else {
-             logicalPlan
-           }
+       if (args.nonEmpty) {
+         if (parsedPlan.isInstanceOf[CompoundBody]) {
+           // Positional parameters are not supported for SQL scripting.
+           throw SqlScriptingErrors.positionalParametersAreNotSupportedWithSqlScripting()
+         }
+         PosParameterizedQuery(parsedPlan, args.map(lit(_).expr).toImmutableArraySeq)
+       } else {
+         parsedPlan
        }
      }
-
-     plan match {
-       case compoundBody: CompoundBody =>
-         // Execute the SQL script.
-         executeSqlScript(compoundBody)
-       case logicalPlan: LogicalPlan =>
-         // Execute the standalone SQL statement.
-         Dataset.ofRows(self, plan, tracker)
-     }
+     Dataset.ofRows(self, plan, tracker)
    }

   /** @inheritdoc */
@@ -549,26 +490,13 @@ class SparkSession private(
    withActive {
      val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
        val parsedPlan = sessionState.sqlParser.parsePlan(sqlText)
-       parsedPlan match {
-         case compoundBody: CompoundBody =>
-           compoundBody
-         case logicalPlan: LogicalPlan =>
-           if (args.nonEmpty) {
-             NameParameterizedQuery(logicalPlan, args.transform((_, v) => lit(v).expr))
-           } else {
-             logicalPlan
-           }
+       if (args.nonEmpty) {
+         NameParameterizedQuery(parsedPlan, args.transform((_, v) => lit(v).expr))
+       } else {
+         parsedPlan
        }
      }
-
-     plan match {
-       case compoundBody: CompoundBody =>
-         // Execute the SQL script.
-         executeSqlScript(compoundBody, args.transform((_, v) => lit(v).expr))
-       case logicalPlan: LogicalPlan =>
-         // Execute the standalone SQL statement.
-         Dataset.ofRows(self, plan, tracker)
-     }
+     Dataset.ofRows(self, plan, tracker)
    }

   /** @inheritdoc */

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 15 additions & 4 deletions
@@ -31,10 +31,10 @@ import org.apache.spark.internal.LogKeys.EXTENDED_EXPLAIN_GENERATOR
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{AnalysisException, ExtendedExplainGenerator, Row}
 import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker}
-import org.apache.spark.sql.catalyst.analysis.{LazyExpression, UnsupportedOperationChecker}
+import org.apache.spark.sql.catalyst.analysis.{LazyExpression, NameParameterizedQuery, UnsupportedOperationChecker}
 import org.apache.spark.sql.catalyst.expressions.codegen.ByteCodeStats
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.plans.logical.{AppendData, Command, CommandResult, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect, ReturnAnswer, Union}
+import org.apache.spark.sql.catalyst.plans.logical.{AppendData, Command, CommandResult, CompoundBody, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect, ReturnAnswer, Union}
 import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule}
 import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
@@ -46,6 +46,7 @@ import org.apache.spark.sql.execution.exchange.EnsureRequirements
 import org.apache.spark.sql.execution.reuse.ReuseExchangeAndSubquery
 import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata, WatermarkPropagator}
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.scripting.SqlScriptingExecution
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.util.{LazyTry, Utils}
 import org.apache.spark.util.ArrayImplicits._
@@ -93,16 +94,26 @@ class QueryExecution(
   }

   private val lazyAnalyzed = LazyTry {
+    val withScriptExecuted = logical match {
+      // Execute the SQL script. Script doesn't need to go through the analyzer as Spark will run
+      // each statement as individual query.
+      case NameParameterizedQuery(compoundBody: CompoundBody, argNames, argValues) =>
+        val args = argNames.zip(argValues).toMap
+        SqlScriptingExecution.executeSqlScript(sparkSession, compoundBody, args)
+      case compoundBody: CompoundBody =>
+        SqlScriptingExecution.executeSqlScript(sparkSession, compoundBody)
+      case _ => logical
+    }
     try {
       val plan = executePhase(QueryPlanningTracker.ANALYSIS) {
         // We can't clone `logical` here, which will reset the `_analyzed` flag.
-        sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker)
+        sparkSession.sessionState.analyzer.executeAndCheck(withScriptExecuted, tracker)
       }
       tracker.setAnalyzed(plan)
       plan
     } catch {
       case NonFatal(e) =>
-        tracker.setAnalysisFailed(logical)
+        tracker.setAnalysisFailed(withScriptExecuted)
         throw e
     }
   }
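For reference, a minimal end-to-end usage sketch under stated assumptions: a local classic session is used, the config key and script text mirror the E2E test above, and the `.remote(...)` comment stands in for the Spark Connect path this change enables. Names like `SqlScriptOverConnectExample` are illustrative only.

```scala
import org.apache.spark.sql.SparkSession

object SqlScriptOverConnectExample {
  def main(args: Array[String]): Unit = {
    // For the Spark Connect path, build with .remote("sc://localhost") instead of .master(...).
    val spark = SparkSession.builder()
      .master("local[*]")
      .getOrCreate()

    // Same switch the E2E test flips before running the script.
    spark.sql("SET spark.sql.scripting.enabled=true")

    // The script is detected in QueryExecution#lazyAnalyzed and executed statement by
    // statement; only the result of the last statement comes back as a DataFrame.
    val result = spark.sql(
      """BEGIN
        |  IF 1 = 1 THEN
        |    SELECT 1;
        |  ELSE
        |    SELECT 2;
        |  END IF;
        |END""".stripMargin)

    result.show() // expected: a single row containing 1

    spark.stop()
  }
}
```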
