amd_wave_read_first_lane.hpp Source File

amd_wave_read_first_lane.hpp Source File#

Composable Kernel: amd_wave_read_first_lane.hpp Source File
amd_wave_read_first_lane.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
6#include "ck/ck.hpp"
8#include "ck/utility/math.hpp"
9
10#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
11#include <array>
12#include <cstddef>
13#include <cstdint>
14#include <type_traits>
15#endif
16
17namespace ck {
18namespace detail {
19
20template <unsigned SizeInBytes>
22
23template <>
24struct get_carrier<1>
25{
26 using type = uint8_t;
27};
28
29template <>
30struct get_carrier<2>
31{
32 using type = uint16_t;
33};
34
35template <>
36struct get_carrier<3>
37{
38 using type = class carrier
39 {
40 using value_type = uint32_t;
41
43 static_assert(sizeof(bytes) <= sizeof(value_type));
44
45 // replacement of host std::copy_n()
46 template <typename InputIterator, typename Size, typename OutputIterator>
47 __device__ static OutputIterator copy_n(InputIterator from, Size size, OutputIterator to)
48 {
49 if(0 < size)
50 {
51 *to = *from;
52 ++to;
53 for(Size count = 1; count < size; ++count)
54 {
55 *to = *++from;
56 ++to;
57 }
58 }
59
60 return to;
61 }
62
63 // method to trigger template substitution failure
64 __device__ carrier(const carrier& other) noexcept
65 {
66 copy_n(other.bytes.begin(), bytes.Size(), bytes.begin());
67 }
68
69 public:
70 __device__ carrier& operator=(value_type value) noexcept
71 {
72 copy_n(reinterpret_cast<const ck::byte*>(&value), bytes.Size(), bytes.begin());
73
74 return *this;
75 }
76
77 __device__ operator value_type() const noexcept
78 {
79 ck::byte result[sizeof(value_type)];
80
81 copy_n(bytes.begin(), bytes.Size(), result);
82
83 return *reinterpret_cast<const value_type*>(result);
84 }
85 };
86};
87static_assert(sizeof(get_carrier<3>::type) == 3);
88
89template <>
90struct get_carrier<4>
91{
92 using type = uint32_t;
93};
94
95template <unsigned SizeInBytes>
97
98} // namespace detail
99
101{
102 return __builtin_amdgcn_readfirstlane(value);
103}
104
106{
107 return __builtin_amdgcn_readfirstlane(value);
108}
109
111{
112 constexpr unsigned object_size = sizeof(int64_t);
113 constexpr unsigned second_part_offset = object_size / 2;
114 auto* const from_obj = reinterpret_cast<const ck::byte*>(&value);
115 alignas(int64_t) ck::byte to_obj[object_size];
116
117 using Sgpr = uint32_t;
118
119 *reinterpret_cast<Sgpr*>(to_obj) =
120 amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj));
121 *reinterpret_cast<Sgpr*>(to_obj + second_part_offset) =
122 amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + second_part_offset));
123
124 return *reinterpret_cast<int64_t*>(to_obj);
125}
126
127template <typename Object,
128 typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
129__device__ auto amd_wave_read_first_lane(const Object& obj)
130{
131 using Size = unsigned;
132 constexpr Size SgprSize = 4;
133 constexpr Size ObjectSize = sizeof(Object);
134
135 auto* const from_obj = reinterpret_cast<const ck::byte*>(&obj);
136 alignas(Object) ck::byte to_obj[ObjectSize];
137
138 constexpr Size RemainedSize = ObjectSize % SgprSize;
139 constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
140 for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
141 {
143
144 *reinterpret_cast<Sgpr*>(to_obj + offset) =
145 amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
146 }
147
148 if constexpr(0 < RemainedSize)
149 {
151
152 *reinterpret_cast<Carrier*>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
153 *reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
154 }
155
158 return *reinterpret_cast<Object*>(to_obj);
159}
160
161} // namespace ck
Definition threadwise_tensor_slice_transfer_util.hpp:15
typename get_carrier< SizeInBytes >::type get_carrier_t
Definition amd_wave_read_first_lane.hpp:96
Definition ck.hpp:268
long int64_t
Definition data_type.hpp:464
@ Sgpr
Definition amd_address_space.hpp:19
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition amd_wave_read_first_lane.hpp:100
typename std::enable_if< B, T >::type enable_if_t
Definition enable_if.hpp:27
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
unsigned short uint16_t
Definition stdint.h:125
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
unsigned char uint8_t
Definition stdint.h:124
Definition utility/array.hpp:14
__host__ static __device__ constexpr index_t Size()
Definition utility/array.hpp:20
__host__ __device__ constexpr const TData * begin() const
Definition utility/array.hpp:39
uint8_t type
Definition amd_wave_read_first_lane.hpp:26
uint16_t type
Definition amd_wave_read_first_lane.hpp:32
class carrier { using value_type=uint32_t; Array< ck::byte, 3 > bytes; static_assert(sizeof(bytes)<=sizeof(value_type)); template< typename InputIterator, typename Size, typename OutputIterator > __device__ static OutputIterator copy_n(InputIterator from, Size size, OutputIterator to) { if(0< size) { *to=*from;++to; for(Size count=1;count< size;++count) { *to=*++from;++to; } } return to; } __device__ carrier(const carrier &other) noexcept { copy_n(other.bytes.begin(), bytes.Size(), bytes.begin()); } public: __device__ carrier &operator=(value_type value) noexcept { copy_n(reinterpret_cast< const ck::byte * >(&value), bytes.Size(), bytes.begin()); return *this; } __device__ operator value_type() const noexcept { ck::byte result[sizeof(value_type)]; copy_n(bytes.begin(), bytes.Size(), result); return *reinterpret_cast< const value_type * >(result); } } type
Definition amd_wave_read_first_lane.hpp:38
uint32_t type
Definition amd_wave_read_first_lane.hpp:92
Definition amd_wave_read_first_lane.hpp:21