kernel_launch.hpp Source File

kernel_launch.hpp Source File#

Composable Kernel: kernel_launch.hpp Source File
tile/host/kernel_launch.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
6#include <numeric>
7#include <functional>
13#include <cstddef>
14#include <hip/hip_runtime.h>
15
16namespace ck_tile {
17
18template <int MinBlockPerCu, typename Kernel, typename... Args>
19#if CK_TILE_USE_LAUNCH_BOUNDS
20__launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
21#endif
22 __global__ void kentry(Args... args)
23{
24#if defined(__HIP_DEVICE_COMPILE__)
25 Kernel{}(args...);
26#else
27 (..., (ignore = args, 0));
28#endif
29}
30
31template <typename Arch, int MinBlockPerCu, typename Kernel, typename... Args>
32#if CK_TILE_USE_LAUNCH_BOUNDS
33__launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
34#endif
35 __global__ void kentry(Args... args)
36{
37#if defined(__HIP_DEVICE_COMPILE__)
38 Kernel{}(args...);
39#else
40 (..., (ignore = args, 0));
41#endif
42}
43
44//
45// return a anonymous functor(lambda) to be called later
46// the KernelImpl should be a class without non-static data member, or let's say
47// can be instantiate with "KernelImpl{}"
48//
49// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
50//
51// Arch can be used to support linking multiple object files that have the same kernel compiled for
52// different architectures. In this case each object file has to use a different tag (gfx9_t,
53// gfx12_t etc.), so the kernel will have different symbols for each architecture.
54//
55template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
56 typename Arch = void,
57 typename KernelImpl,
58 typename... Args>
59CK_TILE_HOST auto
60make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
61{
62 const auto kernel = []() {
63 if constexpr(std::is_void_v<Arch>)
64 {
65 return kentry<MinBlockPerCu, KernelImpl, Args...>;
66 }
67 else
68 {
69 return kentry<Arch, MinBlockPerCu, KernelImpl, Args...>;
70 }
71 }();
72 return [=](const stream_config& s) {
73 kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
74 };
75}
76
77template <typename... Callables>
78CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
79{
80 // abort the sequence in case of intermediate error
81 if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
82 {
83 HIP_CHECK_ERROR(hipGetLastError());
84 }
85}
86
87// Measure the preprocess time during the cold iterations
88template <typename TimerType, typename PreprocessFunc>
89CK_TILE_HOST double
90preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
91{
92 timer.start(s.stream_id_);
93 for(int i = 0; i < s.nrepeat_; i++)
94 {
95 if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
96 {
97 preprocess();
98 }
99 }
100 timer.stop(s.stream_id_);
101
102 return timer.duration() / s.nrepeat_;
103}
104
105template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
106CK_TILE_HOST double timing_loop_impl(TimerType timer,
107 const stream_config& s,
108 CallablesFunc&& callables_func,
109 PreprocessFunc preprocess = nullptr)
110{
111 for(int i = 0; i < s.cold_niters_; i++)
112 {
113 if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
114 {
115 preprocess();
116 }
117 callables_func();
118 }
119 // Only profile preprocess if it's provided
120 auto preprocess_time = 0.0;
121 if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
122 {
123 preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
124 }
125
126 int i = 0;
127 timer.start(s.stream_id_);
128 while(i < s.nrepeat_)
129 {
130 if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
131 {
132 preprocess();
133 }
134
135 callables_func();
136 i++;
137 }
138 timer.stop(s.stream_id_);
139
140 if(!i)
141 return 0.;
142 return (timer.duration() / s.nrepeat_) - preprocess_time;
143}
144
145// clang-format off
146/*
147 * launch_kernel()
148 *
149 * this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
150 * the callables should have signature as "operator()(const stream_config& s){ ... }" to call
151 *
152 * the simplest way is pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
153 * as signature, for the callable (pay attention to the capture list)
154 *
155 * e.g.
156 * ck_tile::launch_kernel(s,
157 * [=](const stream_config& s){ hipMemset(ptr, 0, size) },
158 * [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
159 * );
160 *
161 * if you use ck_tile kernel, or similiar to this style (structure with "static __device__ operator()(...){}")
162 * you can pass your kernel to ck_tile::make_kernel(), which will create a anonymous functor for you,
163 * then pass it to ck_tile::launch_kernel()
164 *
165 * e.g.
166 * ck_tile::launch_kernel(s,
167 * ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
168 * ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
169 * ...);
170 **/
171// clang-format on
172template <typename... Callables>
173CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
174{
175 static_assert(sizeof...(callables) > 0, "At least one callable is required!");
176
177 if(!s.time_kernel_)
178 {
179 launch_and_check(s, std::forward<Callables>(callables)...);
180 return 0;
181 }
182
183 auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
184
185 if(s.is_gpu_timer_)
186 {
187 return timing_loop_impl(gpu_timer{}, s, callables_func);
188 }
189 else
190 {
191 return timing_loop_impl(cpu_timer{}, s, callables_func);
192 }
193}
194
195template <typename PreprocessFunc, typename... Callables>
196CK_TILE_HOST float
197launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
198{
199 static_assert(sizeof...(callables) > 0, "At least one callable is required!");
200
201 if(!s.time_kernel_)
202 {
203 preprocess();
204 launch_and_check(s, std::forward<Callables>(callables)...);
205 return 0;
206 }
207
208 auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
209
210 if(s.is_gpu_timer_)
211 {
212 return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
213 }
214 else
215 {
216 return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
217 }
218}
219} // namespace ck_tile
#define CK_TILE_MIN_BLOCK_PER_CU
Definition config.hpp:115
#define CK_TILE_HOST
Definition config.hpp:40
#define HIP_CHECK_ERROR(retval_or_funcall)
Definition host_utility/hip_check_error.hpp:21
Definition tile/core/algorithm/cluster_descriptor.hpp:13
__global__ void kentry(Args... args)
Definition tile/host/kernel_launch.hpp:22
constexpr detail::ignore_t ignore
Definition tile/core/utility/ignore.hpp:20
CK_TILE_HOST double timing_loop_impl(TimerType timer, const stream_config &s, CallablesFunc &&callables_func, PreprocessFunc preprocess=nullptr)
Definition tile/host/kernel_launch.hpp:106
CK_TILE_HOST auto make_kernel(KernelImpl, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
Definition tile/host/kernel_launch.hpp:60
CK_TILE_HOST double preprocess_profiling_impl(TimerType timer, const stream_config &s, PreprocessFunc preprocess)
Definition tile/host/kernel_launch.hpp:90
CK_TILE_HOST void launch_and_check(const stream_config &sc, Callables &&... callables)
Definition tile/host/kernel_launch.hpp:78
CK_TILE_HOST float launch_kernel_time_mask(const stream_config &s, PreprocessFunc preprocess, Callables &&... callables)
Definition tile/host/kernel_launch.hpp:197
CK_TILE_HOST float launch_kernel(const stream_config &s, Callables &&... callables)
Definition tile/host/kernel_launch.hpp:173
Definition timer.hpp:52
Definition timer.hpp:15
Definition ck_tile/host/stream_config.hpp:30
hipStream_t stream_id_
Definition ck_tile/host/stream_config.hpp:31
int cold_niters_
Definition ck_tile/host/stream_config.hpp:34
bool time_kernel_
Definition ck_tile/host/stream_config.hpp:32
int nrepeat_
Definition ck_tile/host/stream_config.hpp:35
bool is_gpu_timer_
Definition ck_tile/host/stream_config.hpp:36