amd_buffer_addressing.hpp File Reference#
amd_buffer_addressing.hpp File Reference
#include "ck_tile/core/config.hpp"#include "ck_tile/core/numeric/integer.hpp"#include "ck_tile/core/numeric/integral_constant.hpp"#include "ck_tile/core/numeric/vector_type.hpp"#include "ck_tile/core/container/container_helper.hpp"#include "ck_tile/core/container/thread_buffer.hpp"#include "ck_tile/core/utility/type_traits.hpp"#include "ck_tile/core/utility/bit_cast.hpp"#include "ck_tile/core/utility/functional.hpp"#include "ck_tile/core/utility/ignore.hpp"Go to the source code of this file.
Namespaces | |
| namespace | ck_tile |
| namespace | ck_tile::impl |
Macros | |
| #define | LIKELY(x) |
| #define | HAS_RAW_BUFFER_BUILTINS |
| #define | CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) |
| #define | BUFFER_LOAD_USE_INLINEASM 0 |
Typedefs | |
| using | as3_uint32_ptr = uint32_t * |
Enumerations | |
| enum struct | ck_tile::amd_buffer_coherence_enum { ck_tile::coherence_default = 0 , ck_tile::glc = 1 , ck_tile::slc = 2 , ck_tile::glc_slc = 3 , ck_tile::WAVE_NT0 = 0 , ck_tile::WAVE_NT1 = 2 , ck_tile::GROUP_NT0 = 1 , ck_tile::GROUP_NT1 = 3 , ck_tile::DEVICE_NT0 = 8 , ck_tile::DEVICE_NT1 = 10 , ck_tile::SYSTEM_NT0 = 9 , ck_tile::SYSTEM_NT1 = 11 } |
Functions | |
| __device__ uint32_t | ck_tile::amd_wave_read_first_lane (uint16_t v) |
| __device__ uint32_t | ck_tile::amd_wave_read_first_lane (uint8_t v) |
| __device__ uint32_t | ck_tile::amd_wave_read_first_lane (uint32_t value) |
| __device__ int32_t | ck_tile::amd_wave_read_first_lane (int32_t value) |
| template<typename Object, std::enable_if_t< std::is_trivially_copyable_v< Object >, int > = 0> | |
| __device__ auto | ck_tile::amd_wave_read_first_lane (const Object &obj) |
| template<typename ForceSGPR = std::false_type> | |
| CK_TILE_DEVICE int32x4_t | ck_tile::make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff, ForceSGPR={}) |
| CK_TILE_DEVICE void | ck_tile::buffer_load_fence (index_t cnt=0) |
| CK_TILE_DEVICE void | ck_tile::lds_load_fence (index_t cnt=0) |
| template<index_t N> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword (array< float, N > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 2 > (array< float, 2 > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 3 > (array< float, 3 > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 4 > (array< float, 4 > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 8 > (array< float, 8 > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 16 > (array< float, 16 > &b) |
| template<> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep_per_dword< 32 > (array< float, 32 > &b) |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep () |
| template<typename T> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep (T &buffer) |
| template<typename Tx, typename... Ty> | |
| CK_TILE_DEVICE void | ck_tile::impl::insert_dummy_dep (Tx &bx, Ty &... by) |
| template<typename... T> | |
| CK_TILE_DEVICE void | ck_tile::buffer_load_fence (index_t cnt=0, T &... o) |
| CK_TILE_DEVICE void | ck_tile::buffer_store_fence (index_t cnt=0) |
| CK_TILE_DEVICE auto | ck_tile::async_load_fence_raw (index_t cnt=0) |
| CK_TILE_DEVICE_EXTERN int8_t | ck_tile::llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8") |
| CK_TILE_DEVICE_EXTERN int8x2_t | ck_tile::llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8") |
| CK_TILE_DEVICE_EXTERN int8x4_t | ck_tile::llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8") |
| CK_TILE_DEVICE_EXTERN int16_t | ck_tile::llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16") |
| CK_TILE_DEVICE_EXTERN int16x2_t | ck_tile::llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16") |
| CK_TILE_DEVICE_EXTERN int16x4_t | ck_tile::llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16") |
| CK_TILE_DEVICE_EXTERN int32_t | ck_tile::llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32") |
| CK_TILE_DEVICE_EXTERN int32x2_t | ck_tile::llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32") |
| CK_TILE_DEVICE_EXTERN int32x4_t | ck_tile::llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32") |
| CK_TILE_DEVICE_EXTERN _Float16 | ck_tile::llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16") |
| CK_TILE_DEVICE_EXTERN fp16x2_t | ck_tile::llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16") |
| CK_TILE_DEVICE_EXTERN fp16x4_t | ck_tile::llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16") |
| CK_TILE_DEVICE_EXTERN float | ck_tile::llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32") |
| CK_TILE_DEVICE_EXTERN fp32x2_t | ck_tile::llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32") |
| CK_TILE_DEVICE_EXTERN fp32x4_t | ck_tile::llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32") |
| CK_TILE_DEVICE_EXTERN fp16x2_t | ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16") |
| CK_TILE_DEVICE_EXTERN bf16x2_t | ck_tile::llvm_amdgcn_raw_buffer_atomic_add_bf16x2 (bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16") |
| CK_TILE_DEVICE_EXTERN int32_t | ck_tile::llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32") |
| CK_TILE_DEVICE_EXTERN float | ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32") |
| CK_TILE_DEVICE_EXTERN double | ck_tile::llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64") |
| CK_TILE_DEVICE_EXTERN void | ck_tile::llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds") |
| template<unsigned num_dwords, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::async_buffer_load_dwordxn_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={}) |
| CK_TILE_DEVICE void | ck_tile::async_buffer_load_fence (index_t cnt=0) |
| template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
| CK_TILE_DEVICE thread_buffer< int8_t, N > | ck_tile::amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
| CK_TILE_DEVICE thread_buffer< T, N > | ck_tile::amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_async_buffer_load_impl (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE void | ck_tile::amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={}) |
| template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1) |
| template<typename T, index_t N> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE thread_buffer< T, N > | ck_tile::amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE thread_buffer< T, N > | ck_tile::amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| template<typename T, index_t N> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={}) |
| template<typename T, index_t N> | |
| CK_TILE_DEVICE void | ck_tile::amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
Macro Definition Documentation
◆ BUFFER_LOAD_USE_INLINEASM
| #define BUFFER_LOAD_USE_INLINEASM 0 |
◆ CK_TILE_ASYNC_LOAD_WITH_INSTR
| #define CK_TILE_ASYNC_LOAD_WITH_INSTR | ( | instr | ) |
Value:
if constexpr(pre_nop) \
asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory"); \
else \
asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory");
◆ HAS_RAW_BUFFER_BUILTINS
| #define HAS_RAW_BUFFER_BUILTINS |
Value:
__has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \
__has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
__has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
◆ LIKELY
| #define LIKELY | ( | x | ) |
Value:
(__builtin_expect(!!(x), 1))
Typedef Documentation
◆ as3_uint32_ptr
| using as3_uint32_ptr = uint32_t * |