Commit 7ccd0f2: fix calls

1 parent 88ffe20 commit 7ccd0f2

8 files changed, +51 -25 lines changed

core/raydp-main/src/main/java/org/apache/spark/deploy/raydp/RayAppMasterUtils.java
Lines changed: 5 additions & 3 deletions

@@ -38,9 +38,11 @@ public static ActorHandle<RayAppMaster> createAppMaster(
     jvmOptions.add("-cp");
     jvmOptions.add(cp);
     creator.setJvmOptions(jvmOptions);
-    for(Map.Entry<String, Double> resource : appMasterResource.entrySet()) {
-      String resourceName = resource.getKey()
-          .substring(SparkOnRayConfigs.SPARK_MASTER_ACTOR_RESOURCE_PREFIX.length() + 1);
+
+    for (Map.Entry<String, Double> resource : appMasterResource.entrySet()) {
+      String key = resource.getKey();
+      String resourceName = key.substring(
+          SparkOnRayConfigs.SPARK_MASTER_ACTOR_RESOURCE_PREFIX.length() + 1);
       creator.setResource(resourceName, resource.getValue());
     }

core/raydp-main/src/main/java/org/apache/spark/raydp/RayDPUtils.java
Lines changed: 11 additions & 0 deletions

@@ -17,11 +17,13 @@

 package org.apache.spark.raydp;

+import io.ray.api.ActorHandle;
 import io.ray.api.ObjectRef;
 import io.ray.api.Ray;
 import io.ray.api.id.ObjectId;
 import io.ray.runtime.AbstractRayRuntime;
 import io.ray.runtime.object.ObjectRefImpl;
+import org.apache.spark.deploy.raydp.RayAppMaster;

 public class RayDPUtils {

@@ -51,4 +53,13 @@ public static <T> ObjectRef<T> readBinary(byte[] obj, Class<T> clazz, byte[] own
     );
     return ref;
   }
+
+  /**
+   * Helper to invoke putDatasetBlock on RayAppMaster using a Java method
+   * reference, so Ray can serialize the function as a proper Java lambda.
+   */
+  public static ObjectRef<ObjectRef<byte[]>> putDatasetBlockAsync(
+      ActorHandle<RayAppMaster> handle, byte[] data) {
+    return handle.task(RayAppMaster::putDatasetBlock, data).remote();
+  }
 }
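
On the Python side the same call shape is just an actor task whose return value is itself an ObjectRef, so the caller unwraps one level with ray.get. A minimal sketch of that pattern, using a hypothetical PutHolder actor in place of RayAppMaster (the names here are illustrative, not part of this commit):

import ray

ray.init()

@ray.remote
class PutHolder:
    # Hypothetical stand-in for RayAppMaster.putDatasetBlock.
    def put_block(self, data: bytes) -> ray.ObjectRef:
        # Re-put inside the actor; the returned ref is owned by this actor.
        return ray.put(data)

holder = PutHolder.remote()
# The actor task returns an ObjectRef, so the first ray.get unwraps
# the task result down to the inner ref...
inner_ref = ray.get(holder.put_block.remote(b"arrow-batch-bytes"))
# ...and a second get fetches the actual bytes.
assert ray.get(inner_ref) == b"arrow-batch-bytes"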

core/raydp-main/src/main/scala/org/apache/spark/deploy/raydp/AppMasterJavaBridge.scala
Lines changed: 1 addition & 8 deletions

@@ -62,17 +62,10 @@ class AppMasterJavaBridge {
       case (k, v) => k.startsWith(SparkOnRayConfigs.SPARK_MASTER_ACTOR_RESOURCE_PREFIX)
     }.map{ case (k, v) => k->double2Double(v.toString.toDouble) }.asJava

-    // Owner transfer concurrency is configured via a dedicated key and should
-    // not be bundled with the generic AppMaster resources.
-    val ownerTransferConcurrency = Option(
-      sparkProps.get(SparkOnRayConfigs.SPARK_MASTER_OWNER_TRANSFER_CONCURRENCY))
-      .map(_.toString.toInt).getOrElse(0)
-
     handle = RayAppMasterUtils.createAppMaster(
       extra_cp, name,
       (sparkJvmOptions ++ Seq(SparkOnRayConfigs.RAYDP_LOGFILE_PREFIX_CFG)).asJava,
-      appMasterResources,
-      ownerTransferConcurrency)
+      appMasterResources)
   }
 }

core/raydp-main/src/main/scala/org/apache/spark/scheduler/cluster/raydp/RayCoarseGrainedSchedulerBackend.scala
Lines changed: 1 addition & 6 deletions

@@ -91,13 +91,8 @@ class RayCoarseGrainedSchedulerBackend(
       case (k, v) => k.startsWith(SparkOnRayConfigs.SPARK_MASTER_ACTOR_RESOURCE_PREFIX)
     }.map{ case (k, v) => k->double2Double(v.toDouble) }

-    // Owner transfer concurrency is configured via a dedicated key and should
-    // not be bundled with the generic AppMaster resources.
-    val ownerTransferConcurrency =
-      conf.get(SparkOnRayConfigs.SPARK_MASTER_OWNER_TRANSFER_CONCURRENCY, "0").toInt
-
     masterHandle = RayAppMasterUtils.createAppMaster(cp, null, options.toBuffer.asJava,
-      appMasterResources.toMap.asJava, ownerTransferConcurrency)
+      appMasterResources.toMap.asJava)
     uri = new URI(RayAppMasterUtils.getMasterUrl(masterHandle))
   } else {
     uri = new URI(sparkUrl)

core/raydp-main/src/main/scala/org/apache/spark/sql/raydp/ObjectStoreWriter.scala
Lines changed: 6 additions & 4 deletions

@@ -65,10 +65,12 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
       ownerName: String): RecordBatch = {

     // NOTE: We intentionally do NOT pass an owner argument to Ray.put anymore.
-    // Instead, we route all puts through the long-lived RayAppMaster actor,
-    // so that object ownership is decoupled from individual Spark executors.
-    val objectRef: ObjectRef[Array[Byte]] =
-      ObjectStoreWriter.putViaOwner(data)
+    // The default JVM path puts the serialized Arrow batch into Ray's object store
+    // from the Spark executor JVM process.
+    //
+    // Ownership transfer to a long-lived Python actor is implemented on the Python side
+    // by "adopting" (re-putting) these ObjectRefs inside the target actor.
+    val objectRef: ObjectRef[Array[Byte]] = Ray.put(data)

     // add the objectRef to the objectRefHolder to avoid reference GC
     queue.add(objectRef)
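
The "adopting" step the comment above refers to reduces to a ray.get followed by a ray.put inside the target actor, which makes that actor the owner of the new copies. A minimal sketch of the idea, independent of RayDP's own classes (actor and variable names are illustrative):

import ray

ray.init()

@ray.remote
class Adopter:
    # Long-lived actor that takes over ownership of object-store blocks.
    def __init__(self):
        self._adopted = []

    def adopt(self, refs):
        # Fetch each block and re-put it from inside this actor's process;
        # the new refs are owned by this actor, not by the original worker.
        new_refs = [ray.put(ray.get(r)) for r in refs]
        self._adopted.extend(new_refs)
        return new_refs

# Blocks produced elsewhere (here the driver stands in for a Spark executor).
blocks = [ray.put(b"block-0"), ray.put(b"block-1")]
adopter = Adopter.remote()
adopted = ray.get(adopter.adopt.remote(blocks))
print([ray.get(r) for r in adopted])  # [b'block-0', b'block-1']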

python/raydp/spark/dataset.py
Lines changed: 8 additions & 3 deletions

@@ -15,7 +15,7 @@
 # limitations under the License.
 import logging
 import uuid
-from typing import Callable, Dict, List, NoReturn, Optional, Iterable, Union
+from typing import Callable, List, Optional, Union
 from dataclasses import dataclass

 import pandas as pd

@@ -103,7 +103,9 @@ def get_raydp_master_owner(spark: Optional[SparkSession] = None) -> PartitionObj
     def raydp_master_set_reference_as_state(
             raydp_master_actor: ray.actor.ActorHandle,
             objects: List[ObjectRef]) -> ObjectRef:
-        return raydp_master_actor.add_objects.remote(uuid.uuid4(), objects)
+        # Adopt objects in the Python master actor so it becomes the owner of the
+        # dataset blocks without using Ray.put `_owner`.
+        return raydp_master_actor.adopt_objects.remote(uuid.uuid4(), objects)

     return PartitionObjectsOwner(
         obj_holder_name,

@@ -141,7 +143,10 @@ def _save_spark_df_to_object_store(df: sql.DataFrame, use_batch: bool = True,

     if owner is not None:
         actor_owner = ray.get_actor(actor_owner_name)
-        ray.get(owner.set_reference_as_state(actor_owner, blocks))
+        adopted = ray.get(owner.set_reference_as_state(actor_owner, blocks))
+        # If the owner callback returns a new list of refs (adoption), use it.
+        if adopted is not None:
+            blocks = adopted

     return blocks, block_sizes
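
Put together, a caller wires this up through PartitionObjectsOwner: the callback triggers adoption on a named actor, and the adopted refs replace the original blocks. A sketch of the intended call pattern, assuming the module path of the file above and an existing Spark DataFrame df (actor and names are illustrative):

import ray
from raydp.spark.dataset import PartitionObjectsOwner, spark_dataframe_to_ray_dataset

@ray.remote
class BlockHolder:
    def __init__(self):
        self.objects = []

    def adopt_objects(self, objects):
        # Re-put so this actor becomes the owner of the blocks.
        self.objects = [ray.put(ray.get(o)) for o in objects]
        return self.objects

# A named, detached actor so the adopted blocks can outlive the Spark job.
BlockHolder.options(name="block_holder", lifetime="detached").remote()

ds = spark_dataframe_to_ray_dataset(
    df, parallelism=4,
    owner=PartitionObjectsOwner(
        "block_holder",
        lambda actor, objects: actor.adopt_objects.remote(objects)))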

python/raydp/spark/ray_cluster_master.py
Lines changed: 12 additions & 0 deletions

@@ -224,6 +224,18 @@ def get_spark_home(self) -> str:
     def add_objects(self, timestamp, objects):
         self._objects[timestamp] = objects

+    def adopt_objects(self, timestamp, objects):
+        """Adopt objects by re-putting them inside this actor.
+
+        This makes this actor the owner of the newly created objects without
+        using the Ray.put `_owner` argument.
+
+        Returns the new ObjectRefs.
+        """
+        new_objects = [ray.put(ray.get(obj)) for obj in objects]
+        self._objects[timestamp] = new_objects
+        return new_objects
+
     def get_object(self, timestamp, idx):
         return self._objects[timestamp][idx]
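
Driving the two methods above end to end: a client adopts blocks under a timestamp key and can later fetch individual blocks back by index. A small self-contained sketch with a plain actor standing in for the master (the class name is illustrative):

import uuid

import ray

ray.init()

@ray.remote
class Master:
    # Illustrative stand-in for the master actor's object-holding methods.
    def __init__(self):
        self._objects = {}

    def adopt_objects(self, timestamp, objects):
        new_objects = [ray.put(ray.get(obj)) for obj in objects]
        self._objects[timestamp] = new_objects
        return new_objects

    def get_object(self, timestamp, idx):
        return self._objects[timestamp][idx]

master = Master.remote()
key = uuid.uuid4()
adopted = ray.get(master.adopt_objects.remote(key, [ray.put(b"a"), ray.put(b"b")]))

# Adopted refs are new objects with the same contents...
assert [ray.get(r) for r in adopted] == [b"a", b"b"]
# ...and each block can also be fetched back by (timestamp, index).
assert ray.get(ray.get(master.get_object.remote(key, 1))) == b"b"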

python/raydp/tests/test_data_owner_transfer.py
Lines changed: 7 additions & 1 deletion

@@ -154,6 +154,11 @@ def wake(self):
        def set_objects(self, objects):
            self.objects = objects

+       def adopt_objects(self, objects):
+           # Re-put inside this actor so this actor becomes the owner of the new objects.
+           self.objects = [ray.put(ray.get(o)) for o in objects]
+           return self.objects
+
    if ray_client.ray.is_connected():
        pytest.skip("Skip this test if using ray client")

@@ -185,7 +190,7 @@ def set_objects(self, objects):
     # and transfer data ownership to dedicated Object Holder (Singleton)
     ds = spark_dataframe_to_ray_dataset(df_train, parallelism=4, owner=PartitionObjectsOwner(
         owner_actor_name,
-        lambda actor, objects: actor.set_objects.remote(objects)))
+        lambda actor, objects: actor.adopt_objects.remote(objects)))

     # display data
     ds.show(5)

@@ -226,6 +231,7 @@ def test_api_compatibility(ray_cluster, jdk17_extra_spark_configs):

     # check compatibility of ray 1.9.0 API: no data ownership transfer
     ds = ray.data.from_spark(df_train)
+    ds.show(1)
     ray_gc() # ensure GC kicked in
     time.sleep(3)