Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion libc/config/gpu/entrypoints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.time.nanosleep

# gpu/rpc.h entrypoints
libc.src.gpu.rpc_reset
libc.src.gpu.rpc_host_call
)

Expand Down
5 changes: 0 additions & 5 deletions libc/spec/gpu_ext.td
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
[], // Types
[], // Enumerations
[
FunctionSpec<
"rpc_reset",
RetValSpec<VoidType>,
[ArgSpec<UnsignedIntType>, ArgSpec<VoidPtr>]
>,
FunctionSpec<
"rpc_host_call",
RetValSpec<VoidType>,
Expand Down
43 changes: 15 additions & 28 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,20 +88,13 @@ template <bool Invert, typename Packet> struct Process {
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};

/// Bind this process to the shared \p buffer: record the port count and derive
/// the inbox, outbox, and packet pointers from their offsets into the buffer.
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
  using Mailbox = cpp::Atomic<uint32_t>;

  this->port_count = port_count;

  // Each region begins at an offset that is computed from the port count.
  auto inbox_start = advance(buffer, inbox_offset(port_count));
  this->inbox = reinterpret_cast<Mailbox *>(inbox_start);

  auto outbox_start = advance(buffer, outbox_offset(port_count));
  this->outbox = reinterpret_cast<Mailbox *>(outbox_start);

  auto packet_start = advance(buffer, buffer_offset(port_count));
  this->packet = reinterpret_cast<Packet *>(packet_start);
}

/// Returns the beginning of the unified buffer, which is the outbox pointer
/// for an inverted process and the inbox pointer otherwise. Intended for
/// initializing the client after the server has been started.
LIBC_INLINE void *get_buffer_start() const {
  if (Invert)
    return outbox;
  return inbox;
}
/// Construct a process over the shared \p buffer. The port count is stored and
/// the inbox, outbox, and packet pointers are set to the locations inside
/// \p buffer given by inbox_offset, outbox_offset, and buffer_offset for
/// \p port_count.
LIBC_INLINE Process(uint32_t port_count, void *buffer)
: port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, inbox_offset(port_count)))),
outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, outbox_offset(port_count)))),
packet(reinterpret_cast<Packet *>(
advance(buffer, buffer_offset(port_count)))) {}

/// Allocate a memory buffer sufficient to store the following equivalent
/// representation in memory.
Expand All @@ -116,13 +109,13 @@ template <bool Invert, typename Packet> struct Process {
}

/// Retrieve the inbox state from memory shared between processes.
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
  // Relaxed atomic read of the inbox entry for this index; the value is then
  // passed through gpu::broadcast_value together with the lane mask.
  const uint32_t inbox_state = inbox[index].load(cpp::MemoryOrder::RELAXED);
  return gpu::broadcast_value(lane_mask, inbox_state);
}

/// Retrieve the outbox state from memory shared between processes.
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
  // Relaxed atomic read of the outbox entry for this index; the value is then
  // passed through gpu::broadcast_value together with the lane mask.
  const uint32_t outbox_state = outbox[index].load(cpp::MemoryOrder::RELAXED);
  return gpu::broadcast_value(lane_mask, outbox_state);
}
Expand Down Expand Up @@ -349,13 +342,12 @@ struct Client {
LIBC_INLINE Client &operator=(const Client &) = delete;
LIBC_INLINE ~Client() = default;

/// Construct the client by forwarding \p port_count and \p buffer to the
/// underlying process.
LIBC_INLINE Client(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}

using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
template <uint16_t opcode> LIBC_INLINE Port open();

/// Re-initialize the client's process with \p port_count ports over
/// \p buffer by forwarding to Process::reset.
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}

private:
Process<false, Packet<gpu::LANE_SIZE>> process;
};
Expand All @@ -371,18 +363,13 @@ template <uint32_t lane_size> struct Server {
LIBC_INLINE Server &operator=(const Server &) = delete;
LIBC_INLINE ~Server() = default;

/// Construct the server by forwarding \p port_count and \p buffer to the
/// underlying process.
LIBC_INLINE Server(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}

using Port = rpc::Port<true, Packet<lane_size>>;
LIBC_INLINE cpp::optional<Port> try_open();
LIBC_INLINE Port open();

/// Re-initialize the server's process with \p port_count ports over
/// \p buffer by forwarding to Process::reset.
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}

/// Returns whatever the underlying process reports as the start of its
/// buffer.
LIBC_INLINE void *get_buffer_start() const {
return process.get_buffer_start();
}

/// Returns the allocation size that the server-side process type
/// (Process<true, Packet<lane_size>>) reports for \p port_count ports.
LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
return Process<true, Packet<lane_size>>::allocation_size(port_count);
}
Expand Down
11 changes: 0 additions & 11 deletions libc/src/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,3 @@
add_entrypoint_object(
rpc_reset
SRCS
rpc_reset.cpp
HDRS
rpc_reset.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)

add_entrypoint_object(
rpc_host_call
SRCS
Expand Down
24 changes: 0 additions & 24 deletions libc/src/gpu/rpc_reset.cpp

This file was deleted.

18 changes: 0 additions & 18 deletions libc/src/gpu/rpc_reset.h

This file was deleted.

7 changes: 1 addition & 6 deletions libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,7 @@ static void call_fini_array_callbacks() {
} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);

_begin(int argc, char **argv, char **env) {
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
Expand Down
7 changes: 1 addition & 6 deletions libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,7 @@ static void call_fini_array_callbacks() {
} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);

_begin(int argc, char **argv, char **env) {
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
Expand Down
9 changes: 2 additions & 7 deletions libc/test/src/__support/RPC/rpc_smoke_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,8 @@ alignas(64) char buffer[alloc_size] = {0};

TEST(LlvmLibcRPCSmoke, SanityCheck) {

ProcAType ProcA;
ProcBType ProcB;

ProcA.reset(port_count, buffer);
ProcB.reset(port_count, buffer);

EXPECT_EQ(ProcA.get_buffer_start(), ProcB.get_buffer_start());
ProcAType ProcA(port_count, buffer);
ProcBType ProcB(port_count, buffer);

uint64_t index = 0; // any < port_count
uint64_t lane_mask = 1;
Expand Down
1 change: 0 additions & 1 deletion libc/utils/gpu/loader/Loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ struct begin_args_t {
int argc;
void *argv;
void *envp;
void *rpc_shared_buffer;
};

/// The arguments to the '_start' kernel.
Expand Down
46 changes: 44 additions & 2 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,49 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
else
handle_error("Invalid wavefront size");

// Initialize the RPC client on the device by copying the local data to the
// device's internal pointer.
hsa_executable_symbol_t rpc_client_sym;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
handle_error(err);

void *rpc_client_host;
if (hsa_status_t err =
hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
/*flags=*/0, &rpc_client_host))
handle_error(err);

void *rpc_client_dev;
if (hsa_status_t err = hsa_executable_symbol_get_info(
rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
&rpc_client_dev))
handle_error(err);

// Copy the address of the client buffer from the device to the host.
if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
dev_agent, sizeof(void *)))
handle_error(err);

void *rpc_client_buffer;
if (hsa_status_t err = hsa_amd_memory_pool_allocate(
coarsegrained_pool, rpc_get_client_size(),
/*flags=*/0, &rpc_client_buffer))
handle_error(err);
std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
rpc_get_client_size());

// Copy the RPC client buffer to the address pointed to by the symbol.
if (hsa_status_t err =
hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
rpc_client_buffer, host_agent, rpc_get_client_size()))
handle_error(err);

if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
handle_error(err);

// Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
// If the clock_freq symbol is missing, no work to do.
hsa_executable_symbol_t freq_sym;
Expand Down Expand Up @@ -474,8 +517,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp,
rpc_get_buffer(device_id)};
begin_args_t init_args = {argc, dev_argv, dev_envp};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
single_threaded_params, "_begin.kd", init_args))
Expand Down
21 changes: 18 additions & 3 deletions libc/utils/gpu/loader/nvptx/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,25 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
warp_size, rpc_alloc, nullptr))
handle_error(err);

// Initialize the RPC client on the device by copying the local data to the
// device's internal pointer.
CUdeviceptr rpc_client_dev = 0;
uint64_t client_ptr_size = sizeof(void *);
if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
binary, rpc_client_symbol_name))
handle_error(err);

CUdeviceptr rpc_client_host = 0;
if (CUresult err =
cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
handle_error(err);
if (CUresult err =
cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
rpc_get_client_size()))
handle_error(err);

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
// Call the '_begin' kernel to run the program's initialization code.
begin_args_t init_args = {argc, dev_argv, dev_envp,
rpc_get_buffer(device_id)};
begin_args_t init_args = {argc, dev_argv, dev_envp};
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
"_begin", init_args))
handle_error(err);
Expand Down
Loading