diff options
Diffstat (limited to 'src/__support/GPU/amdgpu/utils.h')
-rw-r--r-- | src/__support/GPU/amdgpu/utils.h | 55 |
1 files changed, 24 insertions, 31 deletions
diff --git a/src/__support/GPU/amdgpu/utils.h b/src/__support/GPU/amdgpu/utils.h index 9f0ff0c717a6..9b520a6bcf38 100644 --- a/src/__support/GPU/amdgpu/utils.h +++ b/src/__support/GPU/amdgpu/utils.h @@ -17,9 +17,6 @@ namespace LIBC_NAMESPACE { namespace gpu { -/// The number of threads that execute in lock-step in a lane. -constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE; - /// Type aliases to the address spaces used by the AMDGPU backend. template <typename T> using Private = [[clang::opencl_private]] T; template <typename T> using Constant = [[clang::opencl_constant]] T; @@ -108,15 +105,15 @@ LIBC_INLINE uint64_t get_thread_id() { get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); } -/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware. -LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } +/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware +/// and compilation options. +LIBC_INLINE uint32_t get_lane_size() { + return __builtin_amdgcn_wavefrontsize(); +} /// Returns the id of the thread inside of an AMD wavefront executing together. [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - if constexpr (LANE_SIZE == 64) - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); - else - return __builtin_amdgcn_mbcnt_lo(~0u, 0u); + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); } /// Returns the bit-mask of active threads in the current wavefront. @@ -134,11 +131,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { // the lane_mask & gives the nvptx semantics when lane_mask is a subset of // the active threads - if constexpr (LANE_SIZE == 64) { - return lane_mask & __builtin_amdgcn_ballot_w64(x); - } else { - return lane_mask & __builtin_amdgcn_ballot_w32(x); - } + return lane_mask & __builtin_amdgcn_ballot_w64(x); } /// Waits for all the threads in the block to converge and issues a fence. @@ -152,33 +145,33 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; } __builtin_amdgcn_wave_barrier(); } -/// Returns the current value of the GPU's processor clock. -/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle cycle counter. -LIBC_INLINE uint64_t processor_clock() { - if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime)) - return __builtin_amdgcn_s_memtime(); - else if constexpr (LIBC_HAS_BUILTIN(__builtin_readcyclecounter)) - return __builtin_readcyclecounter(); - else - return 0; +/// Shuffles the the lanes inside the wavefront according to the given index. +[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx, + uint32_t x) { + return __builtin_amdgcn_ds_bpermute(idx << 2, x); } +/// Returns the current value of the GPU's processor clock. +/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter. +LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } + /// Returns a fixed-frequency timestamp. The actual frequency is dependent on /// the card and can only be queried via the driver. LIBC_INLINE uint64_t fixed_frequency_clock() { - if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl)) - return __builtin_amdgcn_s_sendmsg_rtnl(0x83); - else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime)) - return __builtin_amdgcn_s_memrealtime(); - else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime)) - return __builtin_amdgcn_s_memtime(); - else - return 0; + return __builtin_readsteadycounter(); } /// Terminates execution of the associated wavefront. [[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); } +/// Returns a unique identifier for the process cluster the current wavefront is +/// executing on. Here we use the identifier for the compute unit (CU) and +/// shader engine. +/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface +/// than the one at +/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899 +LIBC_INLINE uint32_t get_cluster_id() { return 0; } + } // namespace gpu } // namespace LIBC_NAMESPACE |