1 files changed, 24 insertions, 31 deletions
diff --git a/src/__support/GPU/amdgpu/utils.h b/src/__support/GPU/amdgpu/utils.h
index 9f0ff0c717a6..9b520a6bcf38 100644
--- a/src/__support/GPU/amdgpu/utils.h
+++ b/src/__support/GPU/amdgpu/utils.h
@@ -17,9 +17,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-/// The number of threads that execute in lock-step in a lane.
-constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
-
 /// Type aliases to the address spaces used by the AMDGPU backend.
 template <typename T> using Private = [[clang::opencl_private]] T;
 template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,15 +105,15 @@ LIBC_INLINE uint64_t get_thread_id() {
          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
 }
 
-/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+/// and compilation options.
+LIBC_INLINE uint32_t get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
 
 /// Returns the id of the thread inside of an AMD wavefront executing together.
 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  if constexpr (LANE_SIZE == 64)
-    return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-  else
-    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
 /// Returns the bit-mask of active threads in the current wavefront.
@@ -134,11 +131,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
   // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
   // the active threads
-  if constexpr (LANE_SIZE == 64) {
-    return lane_mask & __builtin_amdgcn_ballot_w64(x);
-  } else {
-    return lane_mask & __builtin_amdgcn_ballot_w32(x);
-  }
+  return lane_mask & __builtin_amdgcn_ballot_w64(x);
 }
 
 /// Waits for all the threads in the block to converge and issues a fence.
@@ -152,33 +145,33 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
   __builtin_amdgcn_wave_barrier();
 }
 
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle cycle counter.
-LIBC_INLINE uint64_t processor_clock() {
-  if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
-    return __builtin_amdgcn_s_memtime();
-  else if constexpr (LIBC_HAS_BUILTIN(__builtin_readcyclecounter))
-    return __builtin_readcyclecounter();
-  else
-    return 0;
+/// Shuffles the lanes inside the wavefront according to the given index.
+[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
+                                                   uint32_t x) {
+  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
 }
 
+/// Returns the current value of the GPU's processor clock.
+/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 /// Returns a fixed-frequency timestamp. The actual frequency is dependent on
 /// the card and can only be queried via the driver.
 LIBC_INLINE uint64_t fixed_frequency_clock() {
-  if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
-    return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
-  else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
-    return __builtin_amdgcn_s_memrealtime();
-  else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
-    return __builtin_amdgcn_s_memtime();
-  else
-    return 0;
+  return __builtin_readsteadycounter();
 }
 
 /// Terminates execution of the associated wavefront.
 [[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
 
+/// Returns a unique identifier for the process cluster the current wavefront is
+/// executing on. The intent is to use the identifier for the compute unit (CU)
+/// and shader engine; for now this placeholder always returns 0.
+/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
+/// than the one at
+/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
+LIBC_INLINE uint32_t get_cluster_id() { return 0; }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE