DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > Struct Template Reference

DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3&lt; ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB &gt; Struct Template Reference#

Composable Kernel: ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > Struct Template Reference
ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB > Struct Template Reference

#include <device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp>

Inheritance diagram for ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >:
ck::tensor_operation::device::DeviceGemmMultipleDLayernorm< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, GammaDataType, BetaDataType, HDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation > ck::tensor_operation::device::BaseOperator

Classes

struct  Argument
struct  Invoker

Public Types

using DeviceOp = DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3
using LayernormBlockTileSize_M_N
using CDEShuffleBlockTransferScalarPerVectors
using GridwiseGemmWelford
using LayernormMeanVarGridDesc_M_NBlock
using LayernormCountGridDesc_M_NBlock
using GammaBetaGridDesc_N = decltype(MakeDescriptor_X<LayernormBlockTileSize_M_N::At(1)>(1))
using EHGridDesc_M_N = decltype(MakeEHGridDescriptor_M_N<Sequence<true, true>, 1, 1>(1, 1, 1))
using GridwiseWelfordLayernorm

Public Member Functions

size_t GetWorkSpaceSize (const BaseArgument *pArg) const override
void SetWorkSpacePointer (BaseArgument *pArg, void *p_workspace, const StreamConfig &=StreamConfig{}) const override
bool IsSupportedArgument (const BaseArgument *p_arg) override
std::unique_ptr< BaseArgumentMakeArgumentPointer (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op) override
std::unique_ptr< BaseInvokerMakeInvokerPointer () override
std::string GetTypeString () const override
Public Member Functions inherited from ck::tensor_operation::device::BaseOperator
 BaseOperator ()=default
 BaseOperator (const BaseOperator &)=default
BaseOperatoroperator= (const BaseOperator &)=default
virtual std::string GetInstanceString () const
virtual std::string GetTypeIdName () const
virtual std::optional< std::string > GetObjectName () const
virtual std::optional< std::string > GetTemplateInfo () const
virtual std::string GetTypeIdHashCode () const
virtual ~BaseOperator ()

Static Public Member Functions

template<typename DoPads, index_t MPerTile, index_t NPerTile>
static auto MakeEHGridDescriptor_M_N (index_t M, index_t N, index_t Stride)
template<index_t XPerTile>
static auto MakeDescriptor_X (index_t X)
static bool IsSupportedArgument (const Argument &arg)
static auto MakeArgument (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, const void *p_gamma, const void *p_beta, void *p_h, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideH, double epsilon, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, HElementwiseOperation h_element_op)
static auto MakeInvoker ()

Static Public Attributes

static constexpr index_t NumDTensor = DsDataType::Size()
static constexpr index_t LayernormHDstVectorSize = CDEShuffleBlockTransferScalarPerVector
static constexpr index_t LayernormGammaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector
static constexpr index_t LayernormBetaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector
static constexpr index_t LayernormESrcVectorSize = CDEShuffleBlockTransferScalarPerVector
static constexpr index_t LayernormThreadSliceSize_N = CDEShuffleBlockTransferScalarPerVector
static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
Static Public Attributes inherited from ck::tensor_operation::device::DeviceGemmMultipleDLayernorm< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, GammaDataType, BetaDataType, HDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation >
static constexpr index_t NumDTensor = DsDataType::Size()

Member Typedef Documentation

◆ CDEShuffleBlockTransferScalarPerVectors

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CDEShuffleBlockTransferScalarPerVectors
Initial value:
Sequence<CDEShuffleBlockTransferScalarPerVector,
CDEShuffleBlockTransferScalarPerVector,
CDEShuffleBlockTransferScalarPerVector>
Definition utility/sequence.hpp:43

◆ DeviceOp

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::DeviceOp = DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3

◆ EHGridDesc_M_N

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::EHGridDesc_M_N = decltype(MakeEHGridDescriptor_M_N<Sequence<true, true>, 1, 1>(1, 1, 1))

◆ GammaBetaGridDesc_N

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GammaBetaGridDesc_N = decltype(MakeDescriptor_X<LayernormBlockTileSize_M_N::At(1)>(1))

◆ GridwiseGemmWelford

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GridwiseGemmWelford

◆ GridwiseWelfordLayernorm

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GridwiseWelfordLayernorm
Initial value:
HDataType,
GammaDataType,
BetaDataType,
AccDataType,
HElementwiseOperation,
BlockSize,
LayernormThreadClusterSize_M_N::At(I0),
LayernormThreadClusterSize_M_N::At(I1),
LayernormThreadSliceSize_M,
Definition gridwise_welford_second_half_layernorm2d.hpp:42
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeMeanVarDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormMeanVarGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:295
static constexpr index_t LayernormGammaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:206
decltype(MakeEHGridDescriptor_M_N< Sequence< true, true >, 1, 1 >(1, 1, 1)) EHGridDesc_M_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:308
static constexpr index_t LayernormHDstVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:205
static constexpr index_t LayernormESrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:208
static constexpr index_t LayernormBetaSrcVectorSize
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:207
static constexpr auto I0
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:215
decltype(MakeDescriptor_X< LayernormBlockTileSize_M_N::At(1)>(1)) GammaBetaGridDesc_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:307
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeCountDescriptor_M_N< Sequence< true, true >, LayernormBlockTileSize_M_N::At(0), LayernormBlockTileSize_M_N::At(1)>(1, 1)) LayernormCountGridDesc_M_NBlock
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:301
static constexpr auto I1
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:216
static constexpr index_t LayernormThreadSliceSize_N
Definition device_gemm_multiple_d_layernorm_wmma_cshuffle_v3.hpp:209

◆ LayernormBlockTileSize_M_N

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormBlockTileSize_M_N
Initial value:
Sequence<LayernormThreadClusterSize_M_N::At(0) * LayernormThreadSliceSize_M,
LayernormThreadClusterSize_M_N::At(1) * LayernormThreadSliceSize_N>

◆ LayernormCountGridDesc_M_NBlock

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormCountGridDesc_M_NBlock
Initial value:
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeCountDescriptor_M_N<

◆ LayernormMeanVarGridDesc_M_NBlock

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
using ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormMeanVarGridDesc_M_NBlock
Initial value:
decltype(GridwiseGemmWelford::EpilogueWelfordCShuffle::template MakeMeanVarDescriptor_M_N<

Member Function Documentation

◆ GetTypeString()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
std::string ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetTypeString ( ) const
inlineoverridevirtual

◆ GetWorkSpaceSize()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
size_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetWorkSpaceSize ( const BaseArgument * pArg) const
inlineoverridevirtual

◆ IsSupportedArgument() [1/2]

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
bool ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::IsSupportedArgument ( const Argument & arg)
inlinestatic

◆ IsSupportedArgument() [2/2]

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
bool ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::IsSupportedArgument ( const BaseArgument * p_arg)
inlineoverridevirtual

◆ MakeArgument()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeArgument ( const void * p_a,
const void * p_b,
std::array< const void *, NumDTensor > p_ds,
const void * p_gamma,
const void * p_beta,
void * p_h,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
std::array< index_t, NumDTensor > StrideDs,
index_t StrideH,
double epsilon,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op,
HElementwiseOperation h_element_op )
inlinestatic

◆ MakeArgumentPointer()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
std::unique_ptr< BaseArgument > ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeArgumentPointer ( const void * p_a,
const void * p_b,
std::array< const void *, NumDTensor > p_ds,
const void * p_gamma,
const void * p_beta,
void * p_h,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
std::array< index_t, NumDTensor > StrideDs,
index_t StrideH,
double epsilon,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op,
HElementwiseOperation h_element_op )
inlineoverridevirtual

◆ MakeDescriptor_X()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
template<index_t XPerTile>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeDescriptor_X ( index_t X)
inlinestatic

◆ MakeEHGridDescriptor_M_N()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
template<typename DoPads, index_t MPerTile, index_t NPerTile>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeEHGridDescriptor_M_N ( index_t M,
index_t N,
index_t Stride )
inlinestatic

◆ MakeInvoker()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeInvoker ( )
inlinestatic

◆ MakeInvokerPointer()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
std::unique_ptr< BaseInvoker > ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeInvokerPointer ( )
inlineoverridevirtual

◆ SetWorkSpacePointer()

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
void ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::SetWorkSpacePointer ( BaseArgument * pArg,
void * p_workspace,
const StreamConfig & = StreamConfig{} ) const
inlineoverridevirtual

Member Data Documentation

◆ I0

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::I0 = Number<0>{}
staticconstexpr

◆ I1

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::I1 = Number<1>{}
staticconstexpr

◆ I2

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::I2 = Number<2>{}
staticconstexpr

◆ I3

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
auto ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::I3 = Number<3>{}
staticconstexpr

◆ LayernormBetaSrcVectorSize

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormBetaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector
staticconstexpr

◆ LayernormESrcVectorSize

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormESrcVectorSize = CDEShuffleBlockTransferScalarPerVector
staticconstexpr

◆ LayernormGammaSrcVectorSize

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormGammaSrcVectorSize = CDEShuffleBlockTransferScalarPerVector
staticconstexpr

◆ LayernormHDstVectorSize

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormHDstVectorSize = CDEShuffleBlockTransferScalarPerVector
staticconstexpr

◆ LayernormThreadSliceSize_N

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::LayernormThreadSliceSize_N = CDEShuffleBlockTransferScalarPerVector
staticconstexpr

◆ NumDTensor

template<typename ALayout, typename BLayout, typename DsLayout, typename HLayout, typename ADataType, typename BDataType, typename DsDataType, typename HDataType, typename AccDataType, typename CShuffleDataType, typename EMeanVarDataType, typename GammaDataType, typename BetaDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename HElementwiseOperation, GemmSpecialization GemmSpec, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEShuffleBlockTransferScalarPerVector, typename LayernormThreadClusterSize_M_N, index_t LayernormThreadSliceSize_M, BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1, typename ComputeTypeA = HDataType, typename ComputeTypeB = ComputeTypeA, bool PermuteA = false, bool PermuteB = false>
index_t ck::tensor_operation::device::DeviceGemmMultipleDLayernorm_Wmma_CShuffleV3< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, DsDataType, HDataType, AccDataType, CShuffleDataType, EMeanVarDataType, GammaDataType, BetaDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, HElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector, LayernormThreadClusterSize_M_N, LayernormThreadSliceSize_M, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::NumDTensor = DsDataType::Size()
staticconstexpr

The documentation for this struct was generated from the following file: