Joseph Huber 507edb52f9 [libc] Enable multiple threads to use RPC on the GPU
The execution model of the GPU expects that groups of threads will
execute in lock-step in SIMD fashion. It's important for both
performance and correctness that we treat this as the smallest possible
granularity for an RPC operation. Thus, we map multiple threads to a
single larger buffer and ship that across the wire.

This patch makes the necessary changes to support executing the RPC on
the GPU with multiple threads. This requires some workarounds to mimic
the model when handling the protocol from the CPU. I'm not completely
happy with some of the workarounds required, but I think it should work.

Uses some of the implementation details from D148191.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D148943
2023-05-04 19:31:41 -05:00

75 lines
2.5 KiB
C++

//===-- Generic device loader interface -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
#include <cstdint>
#include <cstring>
#include <stddef.h>
/// Generic launch parameters for configuring the number of blocks / threads.
/// Each count is specified separately for the x, y, and z dimensions.
struct LaunchParameters {
// Number of threads in each of the x, y, and z dimensions.
uint32_t num_threads_x;
uint32_t num_threads_y;
uint32_t num_threads_z;
// Number of blocks in each of the x, y, and z dimensions.
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
};
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
///
/// \param argc   Number of entries in \p argv.
/// \param argv   Host argument vector.
/// \param envp   Host environment vector. NOTE(review): presumably copied to
///               the device alongside \p argv -- confirm against the
///               per-target implementations, which are not visible here.
/// \param image  Pointer to the device image to load.
/// \param size   Size of \p image (presumably in bytes -- confirm).
/// \param params Block / thread counts to launch the kernel with.
int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params);
/// Return \p val rounded "upwards" to the next multiple of \p align.
///
/// \p align is converted to the value type \p V before use. The result is
/// computed as ((val + align - 1) / align) * align, so \p align does not need
/// to be a power of two but must be non-zero for integral types.
template <typename V, typename A> inline V align_up(V val, A align) {
  const V step = V(align);
  // Bump the value just short of the next multiple so that the truncating
  // division below rounds up instead of down.
  const auto bumped = val + step - 1;
  return (bumped / step) * step;
}
/// Copy the system's argument vector to GPU memory allocated using \p alloc.
///
/// A single buffer is requested from \p alloc and laid out as:
///   [ argc + 1 pointers (null terminated) ][ all strings, back to back ]
/// Each pointer slot holds the address of its string inside the same buffer.
///
/// \param argc  Number of entries in \p argv to copy.
/// \param argv  Array of at least \p argc null-terminated strings.
/// \param alloc Callable taking a size in bytes and returning a pointer to the
///              allocation, or a null pointer on failure. The buffer is written
///              directly from this function, so the returned memory must be
///              writable here and suitably aligned to hold pointers.
/// \returns The start of the copied vector, or nullptr if \p alloc failed.
template <typename Allocator>
void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
  size_t argv_size = sizeof(char *) * (argc + 1);
  size_t str_size = 0;
  for (int i = 0; i < argc; ++i)
    str_size += std::strlen(argv[i]) + 1;

  // We allocate enough space for a null terminated array and all the strings.
  void *dev_argv = alloc(argv_size + str_size);
  if (!dev_argv)
    return nullptr;

  // Store the strings linearly in the same memory buffer, directly after the
  // pointer array.
  void **dev_ptrs = static_cast<void **>(dev_argv);
  uint8_t *dev_str = static_cast<uint8_t *>(dev_argv) + argv_size;
  for (int i = 0; i < argc; ++i) {
    size_t size = std::strlen(argv[i]) + 1;
    std::memcpy(dev_str, argv[i], size);
    dev_ptrs[i] = dev_str;
    dev_str += size;
  }

  // Ensure the vector is null terminated. The terminator lives in the slot
  // right after the last argument, i.e. at element index argc; indexing by
  // the byte size of the pointer array would write far outside the array.
  dev_ptrs[argc] = nullptr;
  return dev_argv;
}
/// Copy the system's environment to GPU memory allocated using \p alloc.
///
/// \param envp  Null-terminated array of environment strings; must not itself
///              be a null pointer.
/// \param alloc Allocation callable, forwarded to copy_argument_vector.
/// \returns Whatever copy_argument_vector returns for the counted entries.
template <typename Allocator>
void *copy_environment(char **envp, Allocator alloc) {
  // Count the entries up to, but not including, the terminating null pointer.
  int num_entries = 0;
  while (envp[num_entries] != nullptr)
    ++num_entries;

  // The environment is copied exactly like an argument vector of that length.
  return copy_argument_vector(num_entries, envp, alloc);
}
#endif