summaryrefslogtreecommitdiff
path: root/src/__support/GPU/amdgpu/utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/__support/GPU/amdgpu/utils.h')
-rw-r--r--src/__support/GPU/amdgpu/utils.h55
1 files changed, 24 insertions, 31 deletions
diff --git a/src/__support/GPU/amdgpu/utils.h b/src/__support/GPU/amdgpu/utils.h
index 9f0ff0c717a6..9b520a6bcf38 100644
--- a/src/__support/GPU/amdgpu/utils.h
+++ b/src/__support/GPU/amdgpu/utils.h
@@ -17,9 +17,6 @@
namespace LIBC_NAMESPACE {
namespace gpu {
-/// The number of threads that execute in lock-step in a lane.
-constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
-
/// Type aliases to the address spaces used by the AMDGPU backend.
template <typename T> using Private = [[clang::opencl_private]] T;
template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,15 +105,15 @@ LIBC_INLINE uint64_t get_thread_id() {
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
-/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+/// and compilation options.
+LIBC_INLINE uint32_t get_lane_size() {
+ return __builtin_amdgcn_wavefrontsize();
+}
/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- if constexpr (LANE_SIZE == 64)
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
- else
- return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+ return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
/// Returns the bit-mask of active threads in the current wavefront.
@@ -134,11 +131,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads
- if constexpr (LANE_SIZE == 64) {
- return lane_mask & __builtin_amdgcn_ballot_w64(x);
- } else {
- return lane_mask & __builtin_amdgcn_ballot_w32(x);
- }
+ return lane_mask & __builtin_amdgcn_ballot_w64(x);
}
/// Waits for all the threads in the block to converge and issues a fence.
@@ -152,33 +145,33 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
__builtin_amdgcn_wave_barrier();
}
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle cycle counter.
-LIBC_INLINE uint64_t processor_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_readcyclecounter))
- return __builtin_readcyclecounter();
- else
- return 0;
+/// Shuffles the the lanes inside the wavefront according to the given index.
+[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
+ uint32_t x) {
+ return __builtin_amdgcn_ds_bpermute(idx << 2, x);
}
+/// Returns the current value of the GPU's processor clock.
+/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
/// the card and can only be queried via the driver.
LIBC_INLINE uint64_t fixed_frequency_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
- return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
- return __builtin_amdgcn_s_memrealtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else
- return 0;
+ return __builtin_readsteadycounter();
}
/// Terminates execution of the associated wavefront.
[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
+/// Returns a unique identifier for the process cluster the current wavefront is
+/// executing on. Here we use the identifier for the compute unit (CU) and
+/// shader engine.
+/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
+/// than the one at
+/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
+LIBC_INLINE uint32_t get_cluster_id() { return 0; }
+
} // namespace gpu
} // namespace LIBC_NAMESPACE