
Commit 30de99c

remove usage of ray.put(data, owner)
1 parent f79c935 commit 30de99c

5 files changed: +46 additions, -13 deletions

core/raydp-main/src/main/java/org/apache/spark/raydp/SparkOnRayConfigs.java

Lines changed: 10 additions & 0 deletions
@@ -10,6 +10,16 @@ public class SparkOnRayConfigs {
     public static final String SPARK_MASTER_ACTOR_RESOURCE_PREFIX =
         "spark.ray.raydp_spark_master.actor.resource";
 
+    /**
+     * Concurrency (max parallelism) for data owner transfer operations, i.e.
+     * how many concurrent putDatasetBlock calls RayAppMaster can handle.
+     *
+     * Example usage:
+     * spark.ray.raydp_spark_master.actor.owner_transfer_concurrency=4
+     */
+    public static final String SPARK_MASTER_OWNER_TRANSFER_CONCURRENCY =
+        "spark.ray.raydp_spark_master.actor.owner_transfer_concurrency";
+
     /**
      * Extra JVM options for the RayDP AppMaster actor and gateway process.
      * This is useful for passing JDK 17+ --add-opens flags.
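For reference, a minimal sketch of setting the new key from the Python side, assuming the usual raydp.init_spark entry point and its configs dict; the surrounding values are illustrative and not part of this commit:

    import ray
    import raydp

    ray.init()

    # Assumed usage: cap RayAppMaster's concurrent putDatasetBlock calls at 4
    # via the new Spark conf key added in SparkOnRayConfigs.
    spark = raydp.init_spark(
        app_name="owner_transfer_example",
        num_executors=2,
        executor_cores=2,
        executor_memory="2GB",
        configs={
            "spark.ray.raydp_spark_master.actor.owner_transfer_concurrency": "4",
        },
    )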

core/raydp-main/src/main/scala/org/apache/spark/sql/raydp/ObjectStoreWriter.scala

Lines changed: 9 additions & 9 deletions
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.raydp
 
 import com.intel.raydp.shims.SparkShimLoader
-import io.ray.api.{ActorHandle, ObjectRef, PyActorHandle, Ray}
+import io.ray.api.{ActorHandle, ObjectRef, Ray}
 import io.ray.runtime.AbstractRayRuntime
 import java.io.ByteArrayOutputStream
-import java.util.{List, UUID}
+import java.util.{List, Optional, UUID}
 import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue}
 import java.util.function.{Function => JFunction}
 import org.apache.arrow.vector.VectorSchemaRoot
@@ -64,13 +64,13 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
       queue: ObjectRefHolder.Queue,
       ownerName: String): RecordBatch = {
 
-    var objectRef: ObjectRef[Array[Byte]] = null
-    if (ownerName == "") {
-      objectRef = Ray.put(data)
-    } else {
-      var dataOwner: PyActorHandle = Ray.getActor(ownerName).get()
-      objectRef = Ray.put(data, dataOwner)
-    }
+    // NOTE: We intentionally do NOT pass an owner argument to Ray.put anymore.
+    // The default JVM path puts the serialized Arrow batch into Ray's object store
+    // from the Spark executor JVM process.
+    //
+    // Ownership transfer to a long-lived Python actor is implemented on the Python side
+    // by "adopting" (re-putting) these ObjectRefs inside the target actor.
+    val objectRef: ObjectRef[Array[Byte]] = Ray.put(data)
 
     // add the objectRef to the objectRefHolder to avoid reference GC
     queue.add(objectRef)
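The ownership semantics this comment relies on can be illustrated with a small, self-contained Ray sketch (hypothetical names, not part of this commit): whichever process calls ray.put owns the resulting ObjectRef, so re-putting the data inside a long-lived (here detached) actor keeps the blocks alive after the original creator exits.

    import ray

    ray.init()

    @ray.remote
    class Holder:
        def __init__(self):
            self.refs = []

        def adopt(self, refs):
            # ray.get pulls each block locally; ray.put re-creates it with this
            # actor as the owner, so the new refs outlive the original creator.
            self.refs = [ray.put(ray.get(r)) for r in refs]
            return self.refs

    holder = Holder.options(name="holder", lifetime="detached").remote()
    originals = [ray.put(f"block-{i}".encode()) for i in range(4)]
    adopted = ray.get(holder.adopt.remote(originals))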

python/raydp/spark/dataset.py

Lines changed: 8 additions & 3 deletions
@@ -15,7 +15,7 @@
 # limitations under the License.
 import logging
 import uuid
-from typing import Callable, Dict, List, NoReturn, Optional, Iterable, Union
+from typing import Callable, List, Optional, Union
 from dataclasses import dataclass
 
 import pandas as pd
@@ -103,7 +103,9 @@ def get_raydp_master_owner(spark: Optional[SparkSession] = None) -> PartitionObjectsOwner:
     def raydp_master_set_reference_as_state(
             raydp_master_actor: ray.actor.ActorHandle,
             objects: List[ObjectRef]) -> ObjectRef:
-        return raydp_master_actor.add_objects.remote(uuid.uuid4(), objects)
+        # Adopt objects in the Python master actor so it becomes the owner of the
+        # dataset blocks without using Ray.put `_owner`.
+        return raydp_master_actor.adopt_objects.remote(uuid.uuid4(), objects)
 
     return PartitionObjectsOwner(
         obj_holder_name,
@@ -141,7 +143,10 @@ def _save_spark_df_to_object_store(df: sql.DataFrame, use_batch: bool = True,
 
     if owner is not None:
         actor_owner = ray.get_actor(actor_owner_name)
-        ray.get(owner.set_reference_as_state(actor_owner, blocks))
+        adopted = ray.get(owner.set_reference_as_state(actor_owner, blocks))
+        # If the owner callback returns a new list of refs (adoption), use it.
+        if adopted is not None:
+            blocks = adopted
 
     return blocks, block_sizes
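End to end, the change means a caller hands the blocks to the RayDP master actor, the actor adopts them, and the adopted refs flow back into the returned dataset. A sketch of that flow, assuming the helpers are importable from raydp.spark.dataset as laid out in this repo; names of the Spark session and dataframe are illustrative:

    import ray
    import raydp
    # Import path assumed from this repo's layout (python/raydp/spark/dataset.py).
    from raydp.spark.dataset import (
        get_raydp_master_owner,
        spark_dataframe_to_ray_dataset,
    )

    ray.init()
    spark = raydp.init_spark("adopt_example", num_executors=2,
                             executor_cores=2, executor_memory="2GB")
    df = spark.range(0, 1000)

    # The RayDP master actor adopts the blocks via adopt_objects, so the
    # resulting Ray dataset stays readable after the Spark executors go away.
    ds = spark_dataframe_to_ray_dataset(df, parallelism=4,
                                        owner=get_raydp_master_owner(spark))
    ds.show(5)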

python/raydp/spark/ray_cluster_master.py

Lines changed: 12 additions & 0 deletions
@@ -224,6 +224,18 @@ def get_spark_home(self) -> str:
     def add_objects(self, timestamp, objects):
         self._objects[timestamp] = objects
 
+    def adopt_objects(self, timestamp, objects):
+        """Adopt objects by re-putting them inside this actor.
+
+        This makes this actor the owner of the newly created objects without
+        using the Ray.put `_owner` argument.
+
+        Returns the new ObjectRefs.
+        """
+        new_objects = [ray.put(ray.get(obj)) for obj in objects]
+        self._objects[timestamp] = new_objects
+        return new_objects
+
     def get_object(self, timestamp, idx):
         return self._objects[timestamp][idx]
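The same entry point can also be driven directly against the master actor. A hedged sketch; the actor name and payload below are placeholders, not values defined by this commit:

    import uuid
    import ray

    # Placeholder name: look up the RayDP master actor however it is named
    # in your deployment.
    master = ray.get_actor("RAYDP_SPARK_MASTER")
    blocks = [ray.put(b"serialized-arrow-batch")]

    # adopt_objects re-puts each block inside the master actor and returns the
    # new refs; callers should switch to these so the originals can be released.
    adopted = ray.get(master.adopt_objects.remote(uuid.uuid4(), blocks))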

python/raydp/tests/test_data_owner_transfer.py

Lines changed: 7 additions & 1 deletion
@@ -154,6 +154,11 @@ def wake(self):
         def set_objects(self, objects):
             self.objects = objects
 
+        def adopt_objects(self, objects):
+            # Re-put inside this actor so this actor becomes the owner of the new objects.
+            self.objects = [ray.put(ray.get(o)) for o in objects]
+            return self.objects
+
     if ray_client.ray.is_connected():
         pytest.skip("Skip this test if using ray client")
@@ -185,7 +190,7 @@ def set_objects(self, objects):
     # and transfer data ownership to dedicated Object Holder (Singleton)
     ds = spark_dataframe_to_ray_dataset(df_train, parallelism=4, owner=PartitionObjectsOwner(
         owner_actor_name,
-        lambda actor, objects: actor.set_objects.remote(objects)))
+        lambda actor, objects: actor.adopt_objects.remote(objects)))
 
     # display data
     ds.show(5)
@@ -226,6 +231,7 @@ def test_api_compatibility(ray_cluster, jdk17_extra_spark_configs):
 
     # check compatibility of ray 1.9.0 API: no data onwership transfer
     ds = ray.data.from_spark(df_train)
+    ds.show(1)
     ray_gc()  # ensure GC kicked in
     time.sleep(3)
