//===-- Shared memory RPC client / server utilities -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIBC_SHARED_RPC_UTIL_H #define LLVM_LIBC_SHARED_RPC_UTIL_H #include #include #if (defined(__NVPTX__) || defined(__AMDGPU__)) && \ !((defined(__CUDA__) && !defined(__CUDA_ARCH__)) || \ (defined(__HIP__) && !defined(__HIP_DEVICE_COMPILE__))) #include #define RPC_TARGET_IS_GPU #endif // Workaround for missing __has_builtin in < GCC 10. #ifndef __has_builtin #define __has_builtin(x) 0 #endif #ifndef RPC_ATTRS #if defined(__CUDA__) || defined(__HIP__) #define RPC_ATTRS __attribute__((host, device)) inline #else #define RPC_ATTRS inline #endif #endif namespace rpc { template struct type_identity { using type = T; }; template struct type_constant { static inline constexpr T value = v; }; template struct remove_reference : type_identity {}; template struct remove_reference : type_identity {}; template struct remove_reference : type_identity {}; template struct is_const : type_constant {}; template struct is_const : type_constant {}; /// Freestanding implementation of std::move. template RPC_ATTRS constexpr typename remove_reference::type &&move(T &&t) { return static_cast::type &&>(t); } /// Freestanding implementation of std::forward. template RPC_ATTRS constexpr T &&forward(typename remove_reference::type &value) { return static_cast(value); } template RPC_ATTRS constexpr T &&forward(typename remove_reference::type &&value) { return static_cast(value); } struct in_place_t { RPC_ATTRS explicit in_place_t() = default; }; struct nullopt_t { RPC_ATTRS constexpr explicit nullopt_t() = default; }; constexpr inline in_place_t in_place{}; constexpr inline nullopt_t nullopt{}; /// Freestanding and minimal implementation of std::optional. template class optional { template struct OptionalStorage { union { char empty; U stored_value; }; bool in_use = false; RPC_ATTRS ~OptionalStorage() { reset(); } RPC_ATTRS constexpr OptionalStorage() : empty() {} template RPC_ATTRS constexpr explicit OptionalStorage(in_place_t, Args &&...args) : stored_value(forward(args)...) {} RPC_ATTRS constexpr void reset() { if (in_use) stored_value.~U(); in_use = false; } }; OptionalStorage storage; public: RPC_ATTRS constexpr optional() = default; RPC_ATTRS constexpr optional(nullopt_t) {} RPC_ATTRS constexpr optional(const T &t) : storage(in_place, t) { storage.in_use = true; } RPC_ATTRS constexpr optional(const optional &) = default; RPC_ATTRS constexpr optional(T &&t) : storage(in_place, move(t)) { storage.in_use = true; } RPC_ATTRS constexpr optional(optional &&O) = default; RPC_ATTRS constexpr optional &operator=(T &&t) { storage = move(t); return *this; } RPC_ATTRS constexpr optional &operator=(optional &&) = default; RPC_ATTRS constexpr optional &operator=(const T &t) { storage = t; return *this; } RPC_ATTRS constexpr optional &operator=(const optional &) = default; RPC_ATTRS constexpr void reset() { storage.reset(); } RPC_ATTRS constexpr const T &value() const & { return storage.stored_value; } RPC_ATTRS constexpr T &value() & { return storage.stored_value; } RPC_ATTRS constexpr explicit operator bool() const { return storage.in_use; } RPC_ATTRS constexpr bool has_value() const { return storage.in_use; } RPC_ATTRS constexpr const T *operator->() const { return &storage.stored_value; } RPC_ATTRS constexpr T *operator->() { return &storage.stored_value; } RPC_ATTRS constexpr const T &operator*() const & { return storage.stored_value; } RPC_ATTRS constexpr T &operator*() & { return storage.stored_value; } RPC_ATTRS constexpr T &&value() && { return move(storage.stored_value); } RPC_ATTRS constexpr T &&operator*() && { return move(storage.stored_value); } }; /// Suspend the thread briefly to assist the thread scheduler during busy loops. RPC_ATTRS void sleep_briefly() { #if __has_builtin(__nvvm_reflect) if (__nvvm_reflect("__CUDA_ARCH") >= 700) asm("nanosleep.u32 64;" ::: "memory"); #elif __has_builtin(__builtin_amdgcn_s_sleep) __builtin_amdgcn_s_sleep(2); #elif __has_builtin(__builtin_ia32_pause) __builtin_ia32_pause(); #elif __has_builtin(__builtin_arm_isb) __builtin_arm_isb(0xf); #else // Simply do nothing if sleeping isn't supported on this platform. #endif } /// Conditional to indicate if this process is running on the GPU. RPC_ATTRS constexpr bool is_process_gpu() { #ifdef RPC_TARGET_IS_GPU return true; #else return false; #endif } /// Wait for all lanes in the group to complete. RPC_ATTRS void sync_lane([[maybe_unused]] uint64_t lane_mask) { #ifdef RPC_TARGET_IS_GPU return __gpu_sync_lane(lane_mask); #endif } /// Copies the value from the first active thread to the rest. RPC_ATTRS uint32_t broadcast_value([[maybe_unused]] uint64_t lane_mask, uint32_t x) { #ifdef RPC_TARGET_IS_GPU return __gpu_read_first_lane_u32(lane_mask, x); #else return x; #endif } /// Returns the number lanes that participate in the RPC interface. RPC_ATTRS uint32_t get_num_lanes() { #ifdef RPC_TARGET_IS_GPU return __gpu_num_lanes(); #else return 1; #endif } /// Returns the id of the thread inside of an AMD wavefront executing together. RPC_ATTRS uint64_t get_lane_mask() { #ifdef RPC_TARGET_IS_GPU return __gpu_lane_mask(); #else return 1; #endif } /// Returns the id of the thread inside of an AMD wavefront executing together. RPC_ATTRS uint32_t get_lane_id() { #ifdef RPC_TARGET_IS_GPU return __gpu_lane_id(); #else return 0; #endif } /// Conditional that is only true for a single thread in a lane. RPC_ATTRS bool is_first_lane([[maybe_unused]] uint64_t lane_mask) { #ifdef RPC_TARGET_IS_GPU return __gpu_is_first_in_lane(lane_mask); #else return true; #endif } /// Returns a bitmask of threads in the current lane for which \p x is true. RPC_ATTRS uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) { #ifdef RPC_TARGET_IS_GPU return __gpu_ballot(lane_mask, x); #else return x; #endif } /// Return \p val aligned "upwards" according to \p align. template RPC_ATTRS constexpr V align_up(V val, A align) { return ((val + V(align) - 1) / V(align)) * V(align); } /// Utility to provide a unified interface between the CPU and GPU's memory /// model. On the GPU stack variables are always private to a lane so we can /// simply use the variable passed in. On the CPU we need to allocate enough /// space for the whole lane and index into it. template RPC_ATTRS V &lane_value(V *val, uint32_t id) { if constexpr (is_process_gpu()) return *val; return val[id]; } /// Advance the \p p by \p bytes. template RPC_ATTRS T *advance(T *ptr, U bytes) { if constexpr (is_const::value) return reinterpret_cast(reinterpret_cast(ptr) + bytes); else return reinterpret_cast(reinterpret_cast(ptr) + bytes); } /// Wrapper around the optimal memory copy implementation for the target. RPC_ATTRS void rpc_memcpy(void *dst, const void *src, size_t count) { __builtin_memcpy(dst, src, count); } template RPC_ATTRS constexpr const T &max(const T &a, const T &b) { return (a < b) ? b : a; } } // namespace rpc #endif // LLVM_LIBC_SHARED_RPC_UTIL_H