flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc Source File

flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc Source File

Composable Kernel: flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc Source File
flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3// clang-format off
4
5// Define the CK_TILE_* macros before including this file to change the kernel variation.
6// Everything defined in this file is #undef'ed again by this file.
7
// Default kernel variation: if the including code has not defined
// CK_TILE_FLATMM_UK_MFMA, fall back to the BF16 variant.
// CK_TILE_FLATMM_UK_MFMA_BF16 itself is not defined in this file; it is
// expected to come from the includer.
8#ifndef CK_TILE_FLATMM_UK_MFMA
9#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
10#endif
11
// Data-type specific helper macros used by the asm text that follows.
// Both branches define the same three macros:
//   _UK_MFMA_        string literal with the matrix-instruction mnemonic
//   _UK_PK_CVT_      asm text that packs two 32-bit VGPR values into one VGPR
//   _UK_ATOMIC_ADD_  string literal with the packed atomic-add mnemonic
// All three are pasted into the asm body by string-literal concatenation.
// NOTE(review): there is no #else branch, so any other value of
// CK_TILE_FLATMM_UK_MFMA leaves these macros undefined.
12#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
13#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
14
// _UK_PK_CVT_(x0_, x1_, y_) -- BF16 variant.
//   x0_, x1_ : string literals naming the two source VGPR operands
//   y_       : string literal naming the destination VGPR operand
// For each source x:
//   v_cmp_u_f32   sets s[36:37] for lanes where x is unordered with itself
//                 (i.e. x is NaN);
//   v_add3_u32    v50 = x + %[v_nan_lo] + 1 (integer add on the raw bits);
//   v_cndmask_b32 takes %[v_nan_hi] in the NaN lanes and v50 elsewhere,
//                 writing v54 (for x0_) or v55 (for x1_).
// v_perm_b32 then combines v55 and v54 into y_ using the byte selector in s52.
// The asm body sets s52 = 0x07060302; this presumably picks the upper 16 bits
// of v55 and of v54 -- confirm against the ISA manual.
// NOTE(review): the values bound to %[v_nan_lo] / %[v_nan_hi] are supplied by
// the enclosing asm statement and are not visible here (likely a rounding
// bias and a NaN pattern -- confirm).
// Clobbers v50, v54, v55 and s[36:37]; reads s52.
// y_ may name the same register as x0_ or x1_, because both sources are
// consumed into v54/v55 before y_ is written.
15#define _UK_PK_CVT_(x0_, x1_, y_) \
16 " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
17 " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
18 " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
19 " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
20 " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
21 " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
22 " v_perm_b32 " y_ ", v55, v54, s52 \n"
23
// Mnemonic of the atomic add issued by the asm body (BF16 variant).
24#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
25
26#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
27#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
28
// _UK_PK_CVT_(x0_, x1_, y_) -- FP16 variant.
// Converts x0_ and x1_ with v_cvt_f16_f32 into v54 and v55, then packs both
// into y_ with v_pack_b32_f16 (v54 is the first source, v55 the second).
// Clobbers v54 and v55. Unlike the BF16 variant it uses neither v50,
// s[36:37], s52 nor the %[v_nan_lo] / %[v_nan_hi] operands.
29#define _UK_PK_CVT_(x0_, x1_, y_) \
30 " v_cvt_f16_f32 v54, " x0_ " \n" \
31 " v_cvt_f16_f32 v55, " x1_ " \n" \
32 " v_pack_b32_f16 " y_ ", v54, v55 \n"
33
// Mnemonic of the atomic add issued by the asm body (FP16 variant).
34#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
35
36#endif
37
38";-------------------------------------------------------------\n"
39 " s_mov_b32 s52, 0x07060302 ; v_perm\n"
40 " s_mov_b64 s[38:39], exec ; save current exec\n"
41 " s_mov_b32 s8, %[s_res_o0] \n"
42 " s_mov_b32 s9, %[s_res_o1] \n"
43 " s_mov_b32 s12, %[s_res_b0] \n"
44 " s_mov_b32 s13, %[s_res_b1] \n"
45 " s_mov_b32 s14, %[s_res_b2] \n"
46 " s_mov_b32 s15, %[s_res_b3] \n"
47 " s_mov_b32 s59, 0 \n"
48 " ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
49 " ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
50 " ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
51 " ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
52 " ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
53 " ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
54 " ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
55 " ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
56 " ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
57 " ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
58 " ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
59 " ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
60 " ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
61 " ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
62 " ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
63 " ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
64 " ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
65 " ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
66 " ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
67 " ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
68 " ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
69 " ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
70 " ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
71 " ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
72 " ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
73 " ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
74 " ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
75 " ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
76 " ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
77 " ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
78 " ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
79 " ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
80 " ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
81 " ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
82 " ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
83 " ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
84 " ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
85 " ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
86 " ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
87 " ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
88 " ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
89 " ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
90 " ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
91 " ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
92 " ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
93 " ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
94 " ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
95 " ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
96 " ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
97 " ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
98 " ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
99 " ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
100 " ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
101 " ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
102 " ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
103 " ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
104 " ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
105 " ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
106 " ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
107 " ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
108 " ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
109 " ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
110 " ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
111 " ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
112 " s_waitcnt 0 \n"
113 " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
114 " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
115 " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
116 " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
117 " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
118 " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
119 " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
120 " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
121 " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
122 " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
123 " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
124 " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
125 " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
126 " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
127 " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
128 " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
129 " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
130 " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
131 " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
132 " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
133 " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
134 " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
135 " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
136 " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
137 " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
138 " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
139 " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
140 " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
141 " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
142 " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
143 " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
144 " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
145 " s_add_u32 s12, %[s_tile_os_b], s12 \n"
146 " s_addc_u32 s13, 0, s13 \n"
147 " v_mov_b32 v64, 0 \n"
148 " v_mov_b32 v80, 0 \n"
149 " v_mov_b32 v65, 0 \n"
150 " v_mov_b32 v81, 0 \n"
151 " v_mov_b32 v66, 0 \n"
152 " v_mov_b32 v82, 0 \n"
153 " v_mov_b32 v67, 0 \n"
154 " v_mov_b32 v83, 0 \n"
155 " v_mov_b32 v68, 0 \n"
156 " v_mov_b32 v84, 0 \n"
157 " v_mov_b32 v69, 0 \n"
158 " v_mov_b32 v85, 0 \n"
159 " v_mov_b32 v70, 0 \n"
160 " v_mov_b32 v86, 0 \n"
161 " v_mov_b32 v71, 0 \n"
162 " v_mov_b32 v87, 0 \n"
163 " ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640 \n"
164 " ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992 \n"
165 " ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816 \n"
166 " ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n"
167 " s_mov_b32 s80, 0 \n"
168 " s_waitcnt vmcnt(24) \n"
169 "coreloop_top_%=: \n"
170 " s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
171 " s_barrier \n"
172 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
173 " ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
174 " ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
175 " ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n"
176 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67] \n"
177 " buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
178 " ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n"
179 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67] \n"
180 " ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
181 " ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
182 " ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n"
183 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67] \n"
184 " ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n"
185 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67] \n"
186 " ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
187 " ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n"
188 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67] \n"
189 " buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
190 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67] \n"
191 " ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
192 " ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n"
193 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67] \n"
194 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
195 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71] \n"
196 " buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
197 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71] \n"
198 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71] \n"
199 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71] \n"
200 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71] \n"
201 " buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
202 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71] \n"
203 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71] \n"
204 " s_waitcnt lgkmcnt(0) \n"
205 " s_mov_b64 exec, %[s_execflag_0] \n"
206 _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
207 " s_mov_b64 exec, s[38:39] \n"
208 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
209 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75] \n"
210 " buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
211 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75] \n"
212 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75] \n"
213 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75] \n"
214 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75] \n"
215 " buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
216 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75] \n"
217 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75] \n"
218 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
219 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79] \n"
220 " buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
221 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79] \n"
222 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79] \n"
223 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79] \n"
224 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79] \n"
225 " buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
226 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79] \n"
227 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79] \n"
228 " s_mov_b64 exec, %[s_execflag_1] \n"
229 _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
230 " s_mov_b64 exec, s[38:39] \n"
231 " s_waitcnt vmcnt(30) \n"
232 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67] \n"
233 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67] \n"
234 " buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
235 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67] \n"
236 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67] \n"
237 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67] \n"
238 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67] \n"
239 " buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
240 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67] \n"
241 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67] \n"
242 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71] \n"
243 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71] \n"
244 " buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
245 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71] \n"
246 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71] \n"
247 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71] \n"
248 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71] \n"
249 " buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
250 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71] \n"
251 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71] \n"
252 " s_mov_b64 exec, %[s_execflag_2] \n"
253 _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
254 " s_mov_b64 exec, s[38:39] \n"
255 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75] \n"
256 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75] \n"
257 " buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
258 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75] \n"
259 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75] \n"
260 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75] \n"
261 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75] \n"
262 " buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
263 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75] \n"
264 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75] \n"
265 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79] \n"
266 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79] \n"
267 " buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
268 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79] \n"
269 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79] \n"
270 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79] \n"
271 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79] \n"
272 " buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
273 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79] \n"
274 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79] \n"
275 " s_mov_b64 exec, %[s_execflag_3] \n"
276 _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
277 " s_mov_b64 exec, s[38:39] \n"
278 " s_waitcnt vmcnt(30) \n"
279 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67] \n"
280 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67] \n"
281 " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
282 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67] \n"
283 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67] \n"
284 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67] \n"
285 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67] \n"
286 " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
287 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67] \n"
288 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67] \n"
289 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71] \n"
290 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71] \n"
291 " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
292 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71] \n"
293 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71] \n"
294 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71] \n"
295 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71] \n"
296 " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
297 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71] \n"
298 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71] \n"
299 " s_mov_b64 exec, %[s_execflag_4] \n"
300 _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
301 " s_mov_b64 exec, s[38:39] \n"
302 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75] \n"
303 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75] \n"
304 " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
305 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75] \n"
306 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75] \n"
307 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75] \n"
308 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75] \n"
309 " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
310 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75] \n"
311 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75] \n"
312 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79] \n"
313 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79] \n"
314 " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
315 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79] \n"
316 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79] \n"
317 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79] \n"
318 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79] \n"
319 " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
320 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79] \n"
321 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79] \n"
322 " s_mov_b64 exec, %[s_execflag_5] \n"
323 _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
324 " s_mov_b64 exec, s[38:39] \n"
325 " s_waitcnt vmcnt(30) \n"
326 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67] \n"
327 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67] \n"
328 " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
329 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67] \n"
330 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67] \n"
331 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67] \n"
332 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67] \n"
333 " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
334 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67] \n"
335 _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67] \n"
336 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71] \n"
337 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71] \n"
338 " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
339 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71] \n"
340 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71] \n"
341 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71] \n"
342 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71] \n"
343 " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
344 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71] \n"
345 _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71] \n"
346 " s_mov_b64 exec, %[s_execflag_6] \n"
347 _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
348 " s_mov_b64 exec, s[38:39] \n"
349 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75] \n"
350 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75] \n"
351 " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
352 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75] \n"
353 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75] \n"
354 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75] \n"
355 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75] \n"
356 " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
357 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75] \n"
358 _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75] \n"
359 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79] \n"
360 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79] \n"
361 " buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
362 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79] \n"
363 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79] \n"
364 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79] \n"
365 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79] \n"
366 " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
367 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79] \n"
368 _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79] \n"
369 " s_mov_b64 exec, %[s_execflag_7] \n"
370 _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
371 " s_mov_b64 exec, s[38:39] \n"
372 " s_add_u32 s60, 0x00000100, s80 \n"
373 " s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
374 " s_cselect_b32 s56, %[s_tile_os_b], 0 \n"
375 " s_add_u32 s12, s56, s12 \n"
376 " s_addc_u32 s13, 0, s13 \n"
377 " s_cmp_ge_u32 s80, 0x00000100 \n"
378 " s_cselect_b32 s59, %[s_tile_os_o], s59 \n"
379 " s_add_u32 s8, s59, s8 \n"
380 " s_addc_u32 s9, 0, s9 \n"
381 " v_mul_f32 %[c0], %[scale_0], %[c0] \n"
382 " v_mul_f32 %[c1], %[scale_0], %[c1] \n"
383 " v_mul_f32 %[c2], %[scale_0], %[c2] \n"
384 " v_mul_f32 %[c3], %[scale_0], %[c3] \n"
385 " v_mul_f32 %[c4], %[scale_1], %[c4] \n"
386 " v_mul_f32 %[c5], %[scale_1], %[c5] \n"
387 " v_mul_f32 %[c6], %[scale_1], %[c6] \n"
388 " v_mul_f32 %[c7], %[scale_1], %[c7] \n"
389 " v_mul_f32 %[c8], %[scale_0], %[c8] \n"
390 " v_mul_f32 %[c9], %[scale_0], %[c9] \n"
391 " v_mul_f32 %[c10], %[scale_0], %[c10] \n"
392 " v_mul_f32 %[c11], %[scale_0], %[c11] \n"
393 " v_mul_f32 %[c12], %[scale_1], %[c12] \n"
394 " v_mul_f32 %[c13], %[scale_1], %[c13] \n"
395 " v_mul_f32 %[c14], %[scale_1], %[c14] \n"
396 " v_mul_f32 %[c15], %[scale_1], %[c15] \n"
397 _UK_PK_CVT_("%[c0]","%[c1]","%[c0]")
398 _UK_PK_CVT_("%[c2]","%[c3]","%[c1]")
399 _UK_PK_CVT_("%[c4]","%[c5]","%[c2]")
400 _UK_PK_CVT_("%[c6]","%[c7]","%[c3]")
401 _UK_PK_CVT_("%[c8]","%[c9]","%[c4]")
402 _UK_PK_CVT_("%[c10]","%[c11]","%[c5]")
403 _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
404 _UK_PK_CVT_("%[c14]","%[c15]","%[c7]")
405 " s_addk_i32 s80, 0x0080 \n"
406 " s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
407 " s_cbranch_scc0 loop_atomic_%= \n"
408 " s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
409 " s_barrier \n"
410 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n"
411 " ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
412 " ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
413 " ds_write_b64 %[v_sfl_sst], v[64:65] offset:16640 \n"
414 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83] \n"
415 " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
416 " ds_write_b64 %[v_sfl_sst], v[66:67] offset:20992 \n"
417 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83] \n"
418 " ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
419 " ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
420 " ds_write_b64 %[v_sfl_sst], v[68:69] offset:18816 \n"
421 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83] \n"
422 " ds_write_b64 %[v_sfl_sst], v[70:71] offset:23168 \n"
423 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83] \n"
424 " ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
425 " ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n"
426 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83] \n"
427 " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
428 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83] \n"
429 " ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
430 " ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n"
431 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] \n"
432 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0 \n"
433 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87] \n"
434 " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
435 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], v[84:87] \n"
436 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] \n"
437 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87] \n"
438 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87] \n"
439 " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
440 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87] \n"
441 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87] \n"
442 " s_waitcnt lgkmcnt(0) \n"
443 " s_mov_b64 exec, %[s_execflag_0] \n"
444 _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
445 " s_mov_b64 exec, s[38:39] \n"
446 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0 \n"
447 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91] \n"
448 " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
449 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] \n"
450 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91] \n"
451 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] \n"
452 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91] \n"
453 " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
454 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91] \n"
455 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] \n"
456 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0 \n"
457 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95] \n"
458 " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
459 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95] \n"
460 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] \n"
461 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], v[92:95] \n"
462 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95] \n"
463 " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
464 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95] \n"
465 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95] \n"
466 " s_mov_b64 exec, %[s_execflag_1] \n"
467 _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
468 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[160:175] into c16..c19 and c20..c23, interleaved with
// buffer loads through %[v_os_b2] into acc[32:47], followed by the
// exec-masked atomic add for output slot 2.
//
// The leading s_waitcnt stalls until at most 30 vector-memory operations are
// still outstanding. NOTE(review): presumably this guarantees acc[160:175]
// has landed before it is consumed - confirm against the load schedule.
469 " s_waitcnt vmcnt(30) \n"
// c16..c19 chain: sources acc[160:175] and v[144:159], addend v[80:83].
470 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] \n"
471 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83] \n"
472 " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
473 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] \n"
474 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83] \n"
475 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] \n"
476 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83] \n"
477 " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
478 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83] \n"
479 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] \n"
// c20..c23 chain: same acc[160:175] sources, paired with v[208:223],
// addend v[84:87].
480 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87] \n"
481 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87] \n"
482 " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
483 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87] \n"
484 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] \n"
485 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87] \n"
486 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87] \n"
487 " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
488 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87] \n"
489 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87] \n"
// Output slot 2: masked atomic add of v12 at %[v_os_o2], then restore exec
// from s[38:39].
490 " s_mov_b64 exec, %[s_execflag_2] \n"
491 _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
492 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[176:191] into c24..c27 and c28..c31, interleaved with
// buffer loads through %[v_os_b3] into acc[48:63], followed by the
// exec-masked atomic add for output slot 3.
//
// c24..c27 chain: sources acc[176:191] and v[144:159], addend v[88:91].
493 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] \n"
494 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91] \n"
495 " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
496 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] \n"
497 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91] \n"
498 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] \n"
499 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91] \n"
500 " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
501 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91] \n"
502 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] \n"
// c28..c31 chain: same acc[176:191] sources, paired with v[208:223],
// addend v[92:95].
503 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95] \n"
504 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95] \n"
505 " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
506 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95] \n"
507 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] \n"
508 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95] \n"
509 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95] \n"
510 " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
511 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95] \n"
512 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95] \n"
// Output slot 3: masked atomic add of v13 at %[v_os_o3], then restore exec
// from s[38:39].
513 " s_mov_b64 exec, %[s_execflag_3] \n"
514 _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
515 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[192:207] into c16..c19 and c20..c23, interleaved with
// buffer loads through %[v_os_b4] into acc[64:79], followed by the
// exec-masked atomic add for output slot 4.
//
// The leading s_waitcnt stalls until at most 30 vector-memory operations are
// still outstanding. NOTE(review): presumably this covers the loads that
// produced acc[192:207] - confirm against the load schedule.
516 " s_waitcnt vmcnt(30) \n"
// c16..c19 chain: sources acc[192:207] and v[160:175], addend v[80:83].
517 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] \n"
518 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83] \n"
519 " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
520 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] \n"
521 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83] \n"
522 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] \n"
523 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83] \n"
524 " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
525 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83] \n"
526 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] \n"
// c20..c23 chain: same acc[192:207] sources, paired with v[224:239],
// addend v[84:87].
527 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87] \n"
528 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87] \n"
529 " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
530 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87] \n"
531 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] \n"
532 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87] \n"
533 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87] \n"
534 " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
535 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87] \n"
536 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87] \n"
// Output slot 4: masked atomic add of v14 at %[v_os_o4], then restore exec
// from s[38:39].
537 " s_mov_b64 exec, %[s_execflag_4] \n"
538 _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
539 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[208:223] into c24..c27 and c28..c31, interleaved with
// buffer loads through %[v_os_b5] into acc[80:95], followed by the
// exec-masked atomic add for output slot 5.
//
// c24..c27 chain: sources acc[208:223] and v[160:175], addend v[88:91].
540 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] \n"
541 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91] \n"
542 " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
543 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] \n"
544 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91] \n"
545 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] \n"
546 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91] \n"
547 " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
548 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91] \n"
549 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] \n"
// c28..c31 chain: same acc[208:223] sources, paired with v[224:239],
// addend v[92:95].
550 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95] \n"
551 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95] \n"
552 " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
553 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95] \n"
554 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] \n"
555 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95] \n"
556 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95] \n"
557 " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
558 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95] \n"
559 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95] \n"
// Output slot 5: masked atomic add of v15 at %[v_os_o5], then restore exec
// from s[38:39].
560 " s_mov_b64 exec, %[s_execflag_5] \n"
561 _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
562 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[224:239] into c16..c19 and c20..c23, interleaved with
// buffer loads through %[v_os_b6] into acc[96:111], followed by the
// exec-masked atomic add for output slot 6.
//
// The leading s_waitcnt stalls until at most 30 vector-memory operations are
// still outstanding. NOTE(review): presumably this covers the loads that
// produced acc[224:239] - confirm against the load schedule.
563 " s_waitcnt vmcnt(30) \n"
// c16..c19 chain: sources acc[224:239] and v[176:191], addend v[80:83].
564 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] \n"
565 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83] \n"
566 " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
567 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] \n"
568 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83] \n"
569 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] \n"
570 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83] \n"
// The next load is one instruction split over two adjacent string literals;
// the compiler concatenates them, so the "offset:" part belongs to it.
571 " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
572 "offset:1024 \n"
573 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83] \n"
574 _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] \n"
// c20..c23 chain: same acc[224:239] sources, paired with v[240:255],
// addend v[84:87].
575 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87] \n"
576 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87] \n"
577 " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
578 "offset:2048 \n"
579 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87] \n"
580 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] \n"
581 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87] \n"
582 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87] \n"
583 " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
584 "offset:3072 \n"
585 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87] \n"
586 _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87] \n"
// Output slot 6: masked atomic add of v16 at %[v_os_o6], then restore exec
// from s[38:39].
587 " s_mov_b64 exec, %[s_execflag_6] \n"
588 _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
589 " s_mov_b64 exec, s[38:39] \n"
// Stage: MFMAs on acc[240:255] into c24..c27 and c28..c31, interleaved with
// buffer loads through %[v_os_b7] into acc[112:127], followed by the
// exec-masked atomic add for output slot 7. This is the last MFMA stage
// before the scalar base updates and the scale/pack step.
//
// c24..c27 chain: sources acc[240:255] and v[176:191], addend v[88:91].
590 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] \n"
591 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91] \n"
592 " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
593 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91] \n"
594 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] \n"
595 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91] \n"
596 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91] \n"
// Loads below are single instructions split over two adjacent string
// literals; the "offset:" literal continues the preceding line.
597 " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
598 "offset:1024 \n"
599 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91] \n"
600 _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] \n"
// c28..c31 chain: same acc[240:255] sources, paired with v[240:255],
// addend v[92:95]. (acc[n] and v[n] are different register files, so the
// matching indices here do not alias.)
601 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95] \n"
602 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95] \n"
603 " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
604 "offset:2048 \n"
605 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95] \n"
606 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] \n"
607 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95] \n"
608 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95] \n"
609 " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
610 "offset:3072 \n"
611 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95] \n"
612 _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95] \n"
// Output slot 7: masked atomic add of v17 at %[v_os_o7], then restore exec
// from s[38:39].
613 " s_mov_b64 exec, %[s_execflag_7] \n"
614 _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
615 " s_mov_b64 exec, s[38:39] \n"
// Scalar bookkeeping: advance the 64-bit bases held in s[12:13] and s[8:9].
//
// s[12:13] are the low two dwords of the descriptor s[12:15] used by the
// buffer loads (initialised from %[s_res_b0]/%[s_res_b1] at the top of the
// asm body). s60 = s80 + 0x100; if s60 is not below %[s_loop_cnt] (unsigned)
// the step s56 is forced to 0, so the base stops moving. The add/addc pair
// is a 64-bit add with carry.
// NOTE(review): presumably this avoids stepping the load base past the last
// tile - confirm against how s56 and s80 are initialised above this chunk.
616 " s_add_u32 s60, 0x00000100, s80 \n"
617 " s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
618 " s_cselect_b32 s56, s56, 0 \n"
619 " s_add_u32 s12, s56, s12 \n"
620 " s_addc_u32 s13, 0, s13 \n"
// s[8:9] is the base used by the atomic adds (initialised from
// %[s_res_o0]/%[s_res_o1]). s59 starts at 0 and becomes 0x100 once
// s80 >= 0x100 (unsigned); it is then added to s[8:9] with carry.
621 " s_cmp_ge_u32 s80, 0x00000100 \n"
622 " s_cselect_b32 s59, 0x00000100, s59 \n"
623 " s_add_u32 s8, s59, s8 \n"
624 " s_addc_u32 s9, 0, s9 \n"
// Scale the sixteen fp32 results c16..c31 in place, then pack them pairwise
// into eight 32-bit registers c16..c23.
//
// Scaling: c16..c19 and c24..c27 are multiplied by %[scale_0];
// c20..c23 and c28..c31 are multiplied by %[scale_1].
625 " v_mul_f32 %[c16], %[scale_0], %[c16] \n"
626 " v_mul_f32 %[c17], %[scale_0], %[c17] \n"
627 " v_mul_f32 %[c18], %[scale_0], %[c18] \n"
628 " v_mul_f32 %[c19], %[scale_0], %[c19] \n"
629 " v_mul_f32 %[c20], %[scale_1], %[c20] \n"
630 " v_mul_f32 %[c21], %[scale_1], %[c21] \n"
631 " v_mul_f32 %[c22], %[scale_1], %[c22] \n"
632 " v_mul_f32 %[c23], %[scale_1], %[c23] \n"
633 " v_mul_f32 %[c24], %[scale_0], %[c24] \n"
634 " v_mul_f32 %[c25], %[scale_0], %[c25] \n"
635 " v_mul_f32 %[c26], %[scale_0], %[c26] \n"
636 " v_mul_f32 %[c27], %[scale_0], %[c27] \n"
637 " v_mul_f32 %[c28], %[scale_1], %[c28] \n"
638 " v_mul_f32 %[c29], %[scale_1], %[c29] \n"
639 " v_mul_f32 %[c30], %[scale_1], %[c30] \n"
640 " v_mul_f32 %[c31], %[scale_1], %[c31] \n"
// Packing: _UK_PK_CVT_(x0, x1, y) converts two fp32 inputs and writes the
// packed pair to y (see the macro definitions at the top of this file; they
// use v54/v55 as scratch, plus v50, s[36:37] and s52 in the BF16 variant).
// Each destination is only written after both of its inputs - and any
// earlier pair that used it as an input - have been consumed, so the order
// of these eight invocations must be kept.
641 _UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]")
642 _UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
643 _UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]")
644 _UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
645 _UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
646 _UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
647 _UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
648 _UK_PK_CVT_("%[c30]","%[c31]","%[c23]")
// Loop control: advance the counter s80 by 0x80 and compare it (signed)
// against %[s_loop_cnt]. While s80 < %[s_loop_cnt] execution branches back
// to coreloop_top (label defined above this chunk); otherwise it leaves the
// loop through loop_atomic. "%=" is expanded by the compiler to a number
// unique to this asm statement, keeping the labels distinct per instance.
649 " s_addk_i32 s80, 0x0080 \n"
650 " s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
651 " s_cbranch_scc0 loop_atomic_%= \n"
652 " s_branch coreloop_top_%= \n"
653 " loop_atomic_%=: \n"
// Loop epilogue, first flush: read eight dwords back from LDS and issue the
// eight exec-masked atomic adds.
//
// Wait for outstanding LDS/scalar-memory operations, then barrier so every
// wave in the workgroup has reached this point before LDS is read.
// NOTE(review): presumably the data at these offsets was written to LDS
// inside the loop body above this chunk - confirm.
654 " s_waitcnt lgkmcnt(0) \n"
655 " s_barrier \n"
// v10..v13 come from offsets 16640 + {0,32,64,96}; v14..v17 from
// 20992 + {0,32,64,96}, all relative to %[v_sfl_sld].
656 " ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
657 " ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
658 " ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
659 " ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
660 " ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
661 " ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n"
662 " ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
663 " ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n"
// The LDS reads must have returned before v10..v17 are used as atomic data.
664 " s_waitcnt lgkmcnt(0) \n"
// For slot i = 0..7: set exec to %[s_execflag_i] and atomically add v(10+i)
// at %[v_os_oi] relative to base s[8:9]. Exec is restored once at the end
// from the copy saved in s[38:39].
665 " s_mov_b64 exec, %[s_execflag_0] \n"
666 _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
667 " s_mov_b64 exec, %[s_execflag_1] \n"
668 _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
669 " s_mov_b64 exec, %[s_execflag_2] \n"
670 _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
671 " s_mov_b64 exec, %[s_execflag_3] \n"
672 _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
673 " s_mov_b64 exec, %[s_execflag_4] \n"
674 _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
675 " s_mov_b64 exec, %[s_execflag_5] \n"
676 _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
677 " s_mov_b64 exec, %[s_execflag_6] \n"
678 _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
679 " s_mov_b64 exec, %[s_execflag_7] \n"
680 _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
681 " s_mov_b64 exec, s[38:39] \n"
// Loop epilogue, second flush: advance the atomic base, exchange the final
// packed results c16..c23 through LDS, and issue the last eight exec-masked
// atomic adds.
//
// 64-bit add with carry: s[8:9] += s59.
682 " s_add_u32 s8, s59, s8 \n"
683 " s_addc_u32 s9, 0, s9 \n"
// Write the four packed register pairs to LDS relative to %[v_sfl_sst].
// Each instruction is split over two adjacent string literals. Offsets:
// [c16,c17] -> 25344, [c18,c19] -> 29696, [c20,c21] -> 27520,
// [c22,c23] -> 31872.
684 " ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
685 "offset:25344 \n"
686 " ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
687 "offset:29696 \n"
688 " ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
689 "offset:27520 \n"
690 " ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
691 "offset:31872 \n"
// Writes must complete and all waves must arrive before the data is read
// back through the different address register %[v_sfl_sld].
692 " s_waitcnt lgkmcnt(0) \n"
693 " s_barrier \n"
// v10..v13 come from offsets 25344 + {0,32,64,96}; v14..v17 from
// 29696 + {0,32,64,96}.
694 " ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
695 " ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
696 " ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
697 " ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
698 " ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
699 " ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n"
700 " ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
701 " ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n"
// The LDS reads must have returned before v10..v17 are used as atomic data.
702 " s_waitcnt lgkmcnt(0) \n"
// For slot i = 0..7: set exec to %[s_execflag_i] and atomically add v(10+i)
// at %[v_os_oi] relative to base s[8:9]; finally restore exec from s[38:39]
// so the code after the asm body runs with the original mask.
703 " s_mov_b64 exec, %[s_execflag_0] \n"
704 _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
705 " s_mov_b64 exec, %[s_execflag_1] \n"
706 _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
707 " s_mov_b64 exec, %[s_execflag_2] \n"
708 _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
709 " s_mov_b64 exec, %[s_execflag_3] \n"
710 _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
711 " s_mov_b64 exec, %[s_execflag_4] \n"
712 _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
713 " s_mov_b64 exec, %[s_execflag_5] \n"
714 _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
715 " s_mov_b64 exec, %[s_execflag_6] \n"
716 _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
717 " s_mov_b64 exec, %[s_execflag_7] \n"
718 _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
719 " s_mov_b64 exec, s[38:39] \n"
720
// Tear down every helper macro this file defined, including the
// CK_TILE_FLATMM_UK_MFMA variant selector, so the includer can set the
// selector again and include this file a second time with another variant.
// Nothing may be (re)defined after this point: a leftover empty definition
// of _UK_MFMA_, _UK_PK_CVT_ or _UK_ATOMIC_ADD_ would leak into the including
// translation unit and clash with the real definitions on the next include.
721#undef _UK_MFMA_
722#undef _UK_PK_CVT_
723#undef _UK_ATOMIC_ADD_
724#undef CK_TILE_FLATMM_UK_MFMA
725 // clang-format on