llvm · jhuber6 · Sep 21, 2023 · Sep 5, 2023
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
@@ -117,7 +117,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.time.nanosleep
 
     # gpu/rpc.h entrypoints
-    libc.src.gpu.rpc_reset
     libc.src.gpu.rpc_host_call
 )
 

diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td
@@ -5,11 +5,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
     [], // Types
     [], // Enumerations
     [
-        FunctionSpec<
-            "rpc_reset",
-            RetValSpec<VoidType>,
-            [ArgSpec<UnsignedIntType>, ArgSpec<VoidPtr>]
-        >,
         FunctionSpec<
             "rpc_host_call",
             RetValSpec<VoidType>,

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
@@ -88,20 +88,13 @@ template <bool Invert, typename Packet> struct Process {
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
   cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
-  /// Initialize the communication channels.
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    this->port_count = port_count;
-    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
-        advance(buffer, inbox_offset(port_count)));
-    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
-        advance(buffer, outbox_offset(port_count)));
-    this->packet =
-        reinterpret_cast<Packet *>(advance(buffer, buffer_offset(port_count)));
-  }
-
-  /// Returns the beginning of the unified buffer. Intended for initializing the
-  /// client after the server has been started.
-  LIBC_INLINE void *get_buffer_start() const { return Invert ? outbox : inbox; }
+  LIBC_INLINE Process(uint32_t port_count, void *buffer)
+      : port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+                                    advance(buffer, inbox_offset(port_count)))),
+        outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+            advance(buffer, outbox_offset(port_count)))),
+        packet(reinterpret_cast<Packet *>(
+            advance(buffer, buffer_offset(port_count)))) {}
 
   /// Allocate a memory buffer sufficient to store the following equivalent
   /// representation in memory.
@@ -116,13 +109,13 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
+  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(lane_mask,
                                 inbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
+  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(lane_mask,
                                 outbox[index].load(cpp::MemoryOrder::RELAXED));
   }
@@ -349,13 +342,12 @@ struct Client {
   LIBC_INLINE Client &operator=(const Client &) = delete;
   LIBC_INLINE ~Client() = default;
 
+  LIBC_INLINE Client(uint32_t port_count, void *buffer)
+      : process(port_count, buffer) {}
+
   using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
   template <uint16_t opcode> LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    process.reset(port_count, buffer);
-  }
-
 private:
   Process<false, Packet<gpu::LANE_SIZE>> process;
 };
@@ -371,18 +363,13 @@ template <uint32_t lane_size> struct Server {
   LIBC_INLINE Server &operator=(const Server &) = delete;
   LIBC_INLINE ~Server() = default;
 
+  LIBC_INLINE Server(uint32_t port_count, void *buffer)
+      : process(port_count, buffer) {}
+
   using Port = rpc::Port<true, Packet<lane_size>>;
   LIBC_INLINE cpp::optional<Port> try_open();
   LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    process.reset(port_count, buffer);
-  }
-
-  LIBC_INLINE void *get_buffer_start() const {
-    return process.get_buffer_start();
-  }
-
   LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
     return Process<true, Packet<lane_size>>::allocation_size(port_count);
   }

diff --git a/libc/src/gpu/CMakeLists.txt b/libc/src/gpu/CMakeLists.txt
@@ -1,14 +1,3 @@
-add_entrypoint_object(
-  rpc_reset
-  SRCS
-    rpc_reset.cpp
-  HDRS
-    rpc_reset.h
-  DEPENDS
-    libc.src.__support.RPC.rpc_client
-    libc.src.__support.GPU.utils
-)
-
 add_entrypoint_object(
   rpc_host_call
   SRCS

diff --git a/libc/src/gpu/rpc_reset.cpp b/libc/src/gpu/rpc_reset.cpp
diff --git a/libc/src/gpu/rpc_reset.h b/libc/src/gpu/rpc_reset.h
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
@@ -44,12 +44,7 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
-  // We need to set up the RPC client first in case any of the constructors
-  // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
-                                 rpc_shared_buffer);
-
+_begin(int argc, char **argv, char **env) {
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
   // array callbacks as they can potentially register their own atexit

diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
@@ -42,12 +42,7 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
-  // We need to set up the RPC client first in case any of the constructors
-  // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
-                                 rpc_shared_buffer);
-
+_begin(int argc, char **argv, char **env) {
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
   // array callbacks as they can potentially register their own atexit

diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -33,13 +33,8 @@ alignas(64) char buffer[alloc_size] = {0};
 
 TEST(LlvmLibcRPCSmoke, SanityCheck) {
 
-  ProcAType ProcA;
-  ProcBType ProcB;
-
-  ProcA.reset(port_count, buffer);
-  ProcB.reset(port_count, buffer);
-
-  EXPECT_EQ(ProcA.get_buffer_start(), ProcB.get_buffer_start());
+  ProcAType ProcA(port_count, buffer);
+  ProcBType ProcB(port_count, buffer);
 
   uint64_t index = 0; // any < port_count
   uint64_t lane_mask = 1;

diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
@@ -34,7 +34,6 @@ struct begin_args_t {
   int argc;
   void *argv;
   void *envp;
-  void *rpc_shared_buffer;
 };
 
 /// The arguments to the '_start' kernel.

diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -430,6 +430,49 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   else
     handle_error("Invalid wavefront size");
 
+  // Initialize the RPC client on the device: read the pointer stored in the
+  // device's client symbol and copy the rpc_get_client_buffer() data there.
+  hsa_executable_symbol_t rpc_client_sym;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
+    handle_error(err);
+
+  void *rpc_client_host;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
+                                       /*flags=*/0, &rpc_client_host))
+    handle_error(err);
+
+  void *rpc_client_dev;
+  if (hsa_status_t err = hsa_executable_symbol_get_info(
+          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+          &rpc_client_dev))
+    handle_error(err);
+
+  // Fetch the pointer value held by the device symbol into 'rpc_client_host'.
+  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
+                                    dev_agent, sizeof(void *)))
+    handle_error(err);
+
+  void *rpc_client_buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          coarsegrained_pool, rpc_get_client_size(),
+          /*flags=*/0, &rpc_client_buffer))
+    handle_error(err);
+  std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
+              rpc_get_client_size());
+
+  // Copy the staged client data to the device address read from the symbol.
+  if (hsa_status_t err =
+          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
+                     rpc_client_buffer, host_agent, rpc_get_client_size()))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
+    handle_error(err);
+
   // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
   // If the clock_freq symbol is missing, no work to do.
   hsa_executable_symbol_t freq_sym;
@@ -474,8 +517,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            rpc_get_buffer(device_id)};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
           single_threaded_params, "_begin.kd", init_args))

diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -309,10 +309,25 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
                                          warp_size, rpc_alloc, nullptr))
     handle_error(err);
 
+  // Initialize the RPC client on the device: read the pointer stored in the
+  // device's client symbol and copy the rpc_get_client_buffer() data there.
+  CUdeviceptr rpc_client_dev = 0;
+  uint64_t client_ptr_size = sizeof(void *);
+  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
+                                       binary, rpc_client_symbol_name))
+    handle_error(err);
+
+  CUdeviceptr rpc_client_host = 0;
+  if (CUresult err =
+          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
+                       rpc_get_client_size()))
+    handle_error(err);
+
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  // Call the kernel to
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            rpc_get_buffer(device_id)};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                    "_begin", init_args))
     handle_error(err);