24template <
typename SrcData,
28 typename SliceLengths,
29 typename DimAccessOrder,
30 typename SrcVectorTensorLengths,
31 typename SrcVectorTensorContiguousDimOrder,
32 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
50 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
51 "wrong! SrcDesc and DstDesc need to known at compile-time");
54 static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0,
"wrong!");
58 template <
typename SrcRefToOriginDisplacement,
59 typename DstOriginIdx,
62 __device__
void Run(
const SrcDesc&,
63 const SrcRefToOriginDisplacement&,
64 const SrcBuffer& src_buf,
67 DstBuffer& dst_buf)
const
69 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
70 "wrong! SrcDesc and DstDesc need to known at compile-time");
75 "wrong! SrcBuffer or DstBuffer data type is wrong");
77 static_assert(DstBuffer::IsStaticBuffer(),
"wrong! DstBuffer need to be StaticBuffer");
81 "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
89 constexpr auto src_ref_to_origin_disp_idx =
to_multi_index(SrcRefToOriginDisplacement{});
93 constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
98 SrcVectorTensorContiguousDimOrder{}),
101 SrcVectorTensorContiguousDimOrder{});
103 constexpr auto src_vector_desc =
108 constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths;
110 constexpr auto dim_access_order = DimAccessOrder{};
112 constexpr auto ordered_access_lengths =
115 static_ford<
decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
117 constexpr auto data_to_origin_disp_idx =
118 ordered_access_idx.ReorderGivenOld2New(dim_access_order) *
119 src_vector_tensor_lengths;
122 constexpr auto src_ref_to_data_disp_idx =
123 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
125 constexpr auto src_ref_to_data_disp_coord_step =
128 auto src_data_coord = src_ref_coord_;
134 using src_vector_t =
typename decltype(src_vector)::type;
137 src_desc, src_data_coord);
140 src_vector.template AsType<src_vector_t>()(
I0) =
141 src_buf.template Get<src_vector_t>(src_data_coord.GetOffset(), is_src_valid);
147 constexpr index_t src_vector_offset =
148 src_vector_desc.CalculateOffset(src_vector_idx);
150 constexpr index_t dst_offset = dst_desc.CalculateOffset(
151 dst_origin_idx + data_to_origin_disp_idx + src_vector_idx);
159 template <
typename SrcSliceMoveStepIdx>
161 const SrcSliceMoveStepIdx& src_slice_move_step_idx)
163 constexpr auto src_desc = SrcDesc{};
165 const auto src_slice_move_step_iter =
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
std::enable_if< B, T > enable_if
Definition enable_if.hpp:24
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
__host__ __device__ constexpr auto to_multi_index(const T &x)
Definition array_multi_index.hpp:28
__host__ __device__ constexpr auto container_reverse_exclusive_scan(const Array< TData, NSize > &x, Reduce f, TData init)
Definition utility/container_helper.hpp:213
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v4r1.hpp:39
__device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index &src_ref_idx)
Definition threadwise_tensor_slice_transfer_v4r1.hpp:47
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::SrcCoord decltype(make_tensor_coordinate(decltype(a_block_desc_bk0_bm0_bm1_bk1_){}, Index{})) SrcCoord
Definition threadwise_tensor_slice_transfer_v4r1.hpp:43
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::I1 static constexpr auto I1
Definition threadwise_tensor_slice_transfer_v4r1.hpp:37
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v4r1.hpp:41
__device__ void MoveSrcSliceWindow(const SrcDesc &, const SrcSliceMoveStepIdx &src_slice_move_step_idx)
Definition threadwise_tensor_slice_transfer_v4r1.hpp:160
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition threadwise_tensor_slice_transfer_v4r1.hpp:62
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::SrcCoordStep decltype(make_tensor_coordinate_step(decltype(a_block_desc_bk0_bm0_bm1_bk1_){}, Index{})) SrcCoordStep
Definition threadwise_tensor_slice_transfer_v4r1.hpp:45
ck::ThreadwiseTensorSliceTransfer_v4r1< FloatA, FloatA, decltype(a_block_desc_bk0_bm0_bm1_bk1_), decltype(a_thread_desc_bk0_bm0_bm1_bk1_), Sequence< BK0PerThread, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 >, Sequence< 1, 1, BM1PerThreadBM11, BK1 >, Sequence< 0, 1, 2, 3 > >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v4r1.hpp:36
Definition is_known_at_compile_time.hpp:14
Definition utility/math.hpp:34
Definition functional2.hpp:33
Definition functional3.hpp:97