@@ -65,4 +65,12 @@ object KyuubiHiveConnectorConf {
.version("1.12.0")
.booleanConf
.createWithDefault(false)

val READ_RUNTIME_FILTER_ENABLED =
buildConf("spark.sql.kyuubi.hive.connector.read.runtimeFilter.enabled")
.doc("When enabled, partition columns will be exposed as runtime filter attributes, " +
"this is required for Spark Dynamic Partition Pruning (DPP).")
.version("1.12.0")
.booleanConf
.createWithDefault(true)
}
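
A quick way to exercise the new flag per session (a sketch only; it assumes a running SparkSession named `spark` with the connector on the classpath):

// Sketch: toggle the new flag at session level. Disabling it removes the partition
// columns from filterAttributes(), which effectively opts the scan out of DPP.
spark.conf.set("spark.sql.kyuubi.hive.connector.read.runtimeFilter.enabled", "false")
// Default is true, so DPP-capable plans get runtime partition filters out of the box.
spark.conf.set("spark.sql.kyuubi.hive.connector.read.runtimeFilter.enabled", "true")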
@@ -52,6 +52,18 @@ class HiveCatalogFileIndex(

private val baseLocation: Option[URI] = table.storage.locationUri

// Align with Spark's built-in CatalogFileIndex by explicitly overriding equals.
// This keeps `BatchScanExec#equals` stable and enables BroadcastExchange reuse under DPP.
override def equals(other: Any): Boolean = other match {
case that: HiveCatalogFileIndex =>
this.hiveCatalog.name == that.hiveCatalog.name &&
this.catalogTable.identifier == that.catalogTable.identifier
case _ => false
}

override def hashCode(): Int =
31 * hiveCatalog.name.hashCode + catalogTable.identifier.hashCode

override def partitionSchema: StructType = table.partitionSchema

override def listFiles(
@@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kyuubi.spark.connector.hive.read

import java.util.Locale

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, InSet}
import org.apache.spark.sql.connector.expressions.{Expressions, Literal => V2Literal, NamedReference}
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.{sameType, StructTypeHelper}
import org.apache.spark.sql.types.StructType

/**
* Helpers for a Hive-backed V2 [[org.apache.spark.sql.connector.read.Scan]] to
* implement [[org.apache.spark.sql.connector.read.SupportsRuntimeV2Filtering]]
* for Dynamic Partition Pruning (DPP).
*
* Spark's `DataSourceV2Strategy` currently only emits the `IN` form of V2
* [[Predicate]] as a DPP runtime filter, so translation here handles `IN` only.
* Any predicate that does not match the expected shape (a single partition column
* reference plus scalar literals whose dataType matches the column's) is dropped
* entirely rather than partially converted, to avoid incorrect pruning; drops are
* logged at DEBUG.
*/
object HiveRuntimeFilterSupport extends Logging {

/**
* Build the runtime-filterable attribute array. Only partition columns are exposed
* because DPP is only beneficial at the partition directory granularity.
*/
def filterAttributes(partitionColumnNames: Seq[String]): Array[NamedReference] = {
partitionColumnNames.map(Expressions.column).toArray
}

/**
* Translate Spark's runtime V2 `IN` predicates into catalyst `InSet(attr, Set[Any])`
* expressions bound to the given partition attributes.
*/
def toCatalystPartitionFilters(
predicates: Array[Predicate],
partitionSchema: StructType,
isCaseSensitive: Boolean): Seq[Expression] = {
val attrByName: Map[String, AttributeReference] =
partitionSchema.toAttributes
.map(a => normalize(a.name, isCaseSensitive) -> a).toMap

val accepted = predicates.toSeq.flatMap(p => convertIn(p, attrByName, isCaseSensitive))
if (accepted.length < predicates.length) {
logDebug(
s"Dropped ${predicates.length - accepted.length} of ${predicates.length} runtime " +
s"filter(s) not applicable to partition columns " +
s"[${partitionSchema.fieldNames.mkString(", ")}]")
}
accepted
}

/**
* Convert a single V2 `IN` predicate into a catalyst [[InSet]], or `None` if it cannot
* be safely converted. A predicate is accepted only when its first child is a
* [[NamedReference]] resolving to a known partition column and every remaining child
* is a scalar V2 [[V2Literal]] whose dataType matches the partition column's dataType.
*/
private def convertIn(
predicate: Predicate,
attrByName: Map[String, AttributeReference],
isCaseSensitive: Boolean): Option[InSet] = {
val children = predicate.children()
if (predicate.name() != "IN" || children.length < 2) {
None
} else {
children.head match {
case ref: NamedReference =>
val colName = normalize(ref.fieldNames().mkString("."), isCaseSensitive)
attrByName.get(colName).flatMap { attr =>
val values = children.tail
val allLiteralsMatch = values.forall {
case lit: V2Literal[_] => sameType(lit.dataType(), attr.dataType)
case _ => false
}
if (allLiteralsMatch) {
val literalValues = values.iterator.map {
case lit: V2Literal[_] => lit.value()
}.toSet[Any]
Some(InSet(attr, literalValues))
} else {
None
}
}
case _ => None
}
}
}

private def normalize(name: String, isCaseSensitive: Boolean): String =
if (isCaseSensitive) {
name
} else {
name.toLowerCase(Locale.ROOT)
}
}
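
For illustration, a minimal sketch of what the translation above accepts; the schema, column name, and literal value are assumptions, while `Expressions`, `LiteralValue`, and `Predicate` are Spark's V2 expression APIs:

import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, Expressions, LiteralValue}
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.unsafe.types.UTF8String

// Assumed single string partition column `dt`, and the IN-shaped predicate Spark pushes for DPP.
val partitionSchema = new StructType().add("dt", StringType)
val children: Array[V2Expression] = Array(
  Expressions.column("dt"),
  LiteralValue(UTF8String.fromString("2026-05-01"), StringType))
val inPredicate = new Predicate("IN", children)

// Accepted: yields Seq(InSet(dt, Set(2026-05-01))). A predicate over a non-partition column,
// or with mismatched literal types, is dropped and only logged at DEBUG.
val filters = HiveRuntimeFilterSupport.toCatalystPartitionFilters(
  Array(inPredicate), partitionSchema, isCaseSensitive = false)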
@@ -27,7 +27,9 @@ import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTablePartition}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.connector.read.PartitionReaderFactory
import org.apache.spark.sql.connector.expressions.NamedReference
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.connector.read.{PartitionReaderFactory, SupportsRuntimeV2Filtering}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.HiveClientImpl
@@ -36,6 +38,7 @@ import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

import org.apache.kyuubi.spark.connector.hive.{HiveConnectorUtils, KyuubiHiveConnectorException}
import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorConf.READ_RUNTIME_FILTER_ENABLED

case class HiveScan(
sparkSession: SparkSession,
@@ -46,13 +49,29 @@ case class HiveScan(
readPartitionSchema: StructType,
pushedFilters: Array[Filter] = Array.empty,
partitionFilters: Seq[Expression] = Seq.empty,
dataFilters: Seq[Expression] = Seq.empty) extends FileScan {
dataFilters: Seq[Expression] = Seq.empty) extends FileScan
with SupportsRuntimeV2Filtering {

private val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis

private val partFileToHivePartMap: mutable.Map[PartitionedFile, CatalogTablePartition] =
mutable.Map()

private var runtimeFilters: Seq[Expression] = Seq.empty

// Align with Spark's built-in ParquetScan/OrcScan by explicitly overriding equals.
// This keeps `BatchScanExec#equals` stable and enables BroadcastExchange reuse under DPP.
override def equals(obj: Any): Boolean = obj match {
case other: HiveScan =>
super.equals(other) &&
catalogTable.identifier == other.catalogTable.identifier &&
dataSchema == other.dataSchema &&
equivalentFilters(pushedFilters, other.pushedFilters)
case _ => false
}

override def hashCode(): Int = getClass.hashCode()

override def isSplitable(path: Path): Boolean = {
catalogTable.provider.map(_.toUpperCase(Locale.ROOT)).exists {
case "PARQUET" => true
@@ -83,8 +102,9 @@
}

override protected def partitions: Seq[FilePartition] = {
val effectivePartitionFilters = partitionFilters ++ runtimeFilters
val (selectedPartitions, partDirToHivePartMap) =
fileIndex.listHiveFiles(partitionFilters, dataFilters)
fileIndex.listHiveFiles(effectivePartitionFilters, dataFilters)
val maxSplitBytes = FilePartition.maxSplitBytes(sparkSession, selectedPartitions)
val partitionAttributes = toAttributes(fileIndex.partitionSchema)
val attributeMap = partitionAttributes.map(a => normalizeName(a.name) -> a).toMap
@@ -157,4 +177,29 @@

def toAttributes(structType: StructType): Seq[AttributeReference] =
structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)())

// -------------------------------------------------------------------------------
// SupportsRuntimeV2Filtering implementation
// -------------------------------------------------------------------------------

override def filterAttributes(): Array[NamedReference] = {
if (!sparkSession.sessionState.conf.getConf(READ_RUNTIME_FILTER_ENABLED)) {
Array.empty
} else {
HiveRuntimeFilterSupport.filterAttributes(readPartitionSchema.fieldNames.toSeq)
}
}

override def filter(predicates: Array[Predicate]): Unit = {
runtimeFilters = HiveRuntimeFilterSupport.toCatalystPartitionFilters(
predicates,
fileIndex.partitionSchema,
isCaseSensitive)
if (runtimeFilters.nonEmpty) {
logInfo(s"Received ${runtimeFilters.length} runtime partition filter(s) for " +
s"${catalogTable.identifier}")
logDebug(s"Runtime partition filter(s) for ${catalogTable.identifier}: " +
s"${runtimeFilters.mkString(", ")}")
}
}
}
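
As a rough end-to-end check (a sketch; the table names are placeholders, and the non-empty RuntimeFilters entry in the plan string is an assumption about Spark's BatchScanExec rendering):

// Sketch: run a star-join against a partitioned Hive table and inspect the executed plan.
// When DPP applies, the fact-side BatchScanExec should list a non-empty RuntimeFilters entry,
// and listHiveFiles only touches the partition directories surviving the runtime InSet filter.
val df = spark.sql(
  """SELECT f.id, f.dt
    |FROM hive.default.fact f JOIN hive.default.dim d ON f.dt = d.dt
    |WHERE d.tag = 'target'""".stripMargin)
df.collect()
println(df.queryExecution.executedPlan)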
@@ -52,6 +52,8 @@ object HiveBridgeHelper {
sc.listenerBus.post(event)
}

def sameType(left: DataType, right: DataType): Boolean = left.sameType(right)

implicit class TransformHelper(transforms: Seq[Transform]) {
def convertTransforms: (Seq[String], Option[BucketSpec]) = {
val identityCols = new mutable.ArrayBuffer[String]
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kyuubi.spark.connector.hive

import scala.annotation.tailrec

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeV2Filtering}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.execution.datasources.v2.BatchScanExec

class DynamicPartitionPruningSuite extends KyuubiHiveTest {

private def findScan(spark: SparkSession, sql: String, tableNameHint: String): Scan = {
@tailrec
def findBatchScan(plan: SparkPlan): Option[BatchScanExec] = plan match {
case aqe: AdaptiveSparkPlanExec => findBatchScan(aqe.inputPlan)
case _ => plan.collectFirst {
case b: BatchScanExec if b.toString.contains(tableNameHint) => b
}
}
val exec = findBatchScan(spark.sql(sql).queryExecution.executedPlan)
assert(exec.isDefined)
exec.get.scan
}

test("HiveScan supports DPP runtime filtering on partition columns") {
Seq(
("true", Seq("dt")),
("false", Seq.empty[String])).foreach { case (enabled, expectedFilterAttrs) =>
withSparkSession(Map(
"hive.exec.dynamic.partition.mode" -> "nonstrict",
"spark.sql.kyuubi.hive.connector.read.runtimeFilter.enabled" -> enabled)) { spark =>
val suffix = if (enabled == "true") "on" else "off"
val fact = s"hive.default.dpp_fact_$suffix"
val dim = s"hive.default.dpp_dim_$suffix"

withTable(fact, dim) {
spark.sql(
s"""
| CREATE TABLE $fact (id INT, v STRING) PARTITIONED BY (dt STRING)
| STORED AS TEXTFILE
|""".stripMargin).collect()
spark.sql(s"INSERT INTO $fact PARTITION (dt='2026-01-01') VALUES (1, 'a'), (2, 'b')")
spark.sql(s"INSERT INTO $fact PARTITION (dt='2026-05-01') VALUES (3, 'c'), (4, 'd')")
spark.sql(s"INSERT INTO $fact PARTITION (dt='2026-09-01') VALUES (5, 'e'), (6, 'f')")

spark.sql(
s"""
| CREATE TABLE $dim (dt STRING, tag STRING)
| STORED AS TEXTFILE
|""".stripMargin).collect()
spark.sql(s"INSERT INTO $dim VALUES ('2026-05-01', 'target')")

val sql =
s"""
| SELECT f.id, f.v, f.dt
| FROM $fact f JOIN $dim d ON f.dt = d.dt
| WHERE d.tag = 'target'
|""".stripMargin

checkAnswer(
spark.sql(sql),
Seq(
Row(3, "c", "2026-05-01"),
Row(4, "d", "2026-05-01")))

val scan = findScan(spark, sql, fact.split('.').last)
assert(scan.isInstanceOf[SupportsRuntimeV2Filtering])
val filterAttrs = scan.asInstanceOf[SupportsRuntimeV2Filtering]
.filterAttributes().map(_.fieldNames().mkString("."))
assert(filterAttrs.toSeq == expectedFilterAttrs)
}
}
}
}
}