permute_pk_int4.hpp Source File

permute_pk_int4.hpp Source File

Composable Kernel: permute_pk_int4.hpp Source File
permute_pk_int4.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c), Advanced Micro Devices, Inc. All rights reserved.
3#pragma once
4
6namespace ck_tile {
7
28template <typename Tensor>
30{
31 auto tensor_row_buf = tensor.data();
32 for(size_t idx = 0; idx < tensor.size(); idx += 4)
33 {
34 int8_t input[8];
35
36 for(int k = 0; k < 4; k++)
37 {
38 int8_t i4x2 = bit_cast<int8_t>(tensor_row_buf[idx + k]);
39 input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
40 input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
41 }
42
43 // permute 0x76543210 => 0x75316420
44 {
45 int8_t hi = input[2];
46 int8_t lo = input[0];
47 int8_t i4x2 = (hi << 4) | lo;
48
49 tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
50 }
51
52 {
53 int8_t hi = input[6];
54 int8_t lo = input[4];
55 int8_t i4x2 = (hi << 4) | lo;
56
57 tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
58 }
59
60 {
61 int8_t hi = input[3];
62 int8_t lo = input[1];
63 int8_t i4x2 = (hi << 4) | lo;
64
65 tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
66 }
67
68 {
69 int8_t hi = input[7];
70 int8_t lo = input[5];
71 int8_t i4x2 = (hi << 4) | lo;
72
73 tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
74 }
75 }
76}
77
78} // namespace ck_tile
Definition tile/core/algorithm/cluster_descriptor.hpp:13
int8_t int8_t
Definition int8.hpp:20
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
void permute_vectors_i4x4_b(Tensor &tensor)
Permute packed int4 vectors for device implementation compatibility.
Definition permute_pk_int4.hpp:29
Tensor wrapper that performs static and dynamic buffer logic. The tensor is based on a descriptor sto...
Definition library/utility/host_tensor.hpp:694
Data::pointer data()
Definition library/utility/host_tensor.hpp:1129
Data::size_type size() const
Definition library/utility/host_tensor.hpp:1137