DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference

DeviceGroupedQueryAttentionForward_Wmma&lt; NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer &gt; Struct Template Reference#

Composable Kernel: ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference
ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference

#include <device_grouped_query_attention_forward_wmma.hpp>

Inheritance diagram for ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >:
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc1BiasDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, MaskingSpec > ck::tensor_operation::device::BaseOperator

Classes

struct  ComputeBasePtrOfStridedBatch
struct  RawArg
struct  Argument
struct  Invoker

Public Types

using DeviceOp = DeviceGroupedQueryAttentionForward_Wmma
using Transform
using AGridDesc = decltype(MakeAGridDescriptor({}, {}))
using B0GridDesc = decltype(MakeB0GridDescriptor({}, {}))
using B1GridDesc = decltype(MakeB1GridDescriptor({}, {}))
using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {}))
using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {}))
using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {}))
using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {}))
using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {}))
using C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())>
using GridwiseOp

Public Member Functions

bool IsSupportedArgument (const BaseArgument *p_arg) override
std::unique_ptr< BaseArgumentMakeArgumentPointer (const void *p_a, const void *p_b0, const void *p_b1, void *p_c, const std::array< void *, NumAcc0Bias > p_acc0_biases, const std::array< void *, NumAcc1Bias > p_acc1_biases, const std::vector< index_t > &a_gs_ms_ks_lengths, const std::vector< index_t > &a_gs_ms_ks_strides, const std::vector< index_t > &b0_gs_ls_ks_lengths, const std::vector< index_t > &b0_gs_ls_ks_strides, const std::vector< index_t > &b1_gs_ns_ls_lengths, const std::vector< index_t > &b1_gs_ns_ls_strides, const std::vector< index_t > &c_gs_ms_ns_lengths, const std::vector< index_t > &c_gs_ms_ns_strides, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_lengths, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_strides, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_lengths, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_strides, AElementwiseOperation a_element_op, B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) override
std::unique_ptr< BaseInvokerMakeInvokerPointer () override
std::string GetTypeString () const override
Public Member Functions inherited from ck::tensor_operation::device::BaseOperator
 BaseOperator ()=default
 BaseOperator (const BaseOperator &)=default
BaseOperatoroperator= (const BaseOperator &)=default
virtual std::string GetInstanceString () const
virtual std::string GetTypeIdName () const
virtual std::optional< std::string > GetObjectName () const
virtual std::optional< std::string > GetTemplateInfo () const
virtual std::string GetTypeIdHashCode () const
virtual size_t GetWorkSpaceSize (const BaseArgument *) const
virtual void SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const
virtual ~BaseOperator ()

Static Public Member Functions

__host__ static __device__ auto MakeAGridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_strides_vec)
__host__ static __device__ auto MakeB0GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_strides_vec)
__host__ static __device__ auto MakeB1GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_strides_vec)
__host__ __device__ static constexpr auto make_MaskOutPredicate ()
static auto MakeArgument (const ADataType *p_a, const B0DataType *p_b0, const B1DataType *p_b1, CDataType *p_c, index_t M, index_t N, index_t K, index_t O, index_t G0, index_t G1, float alpha, bool input_permute, bool output_permute)
static bool IsSupportedArgument (const RawArg &arg)
static constexpr bool IsValidCompilationParameter ()
static auto MakeInvoker ()

Static Public Attributes

static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size()
static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size()
static constexpr index_t NumDimGemm0M = NumDimM
static constexpr index_t NumDimGemm0N = NumDimL
static constexpr index_t NumDimGemm0K = NumDimK
static constexpr index_t NumDimGemm1M = NumDimM
static constexpr index_t NumDimGemm1N = NumDimN
static constexpr index_t NumDimGemm1K = NumDimL
static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
static constexpr auto I4 = Number<4>{}
static constexpr auto I5 = Number<5>{}
static constexpr auto I6 = Number<6>{}
static constexpr auto WmmaK = 16
static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma)
static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma)
static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma)
static constexpr auto AEnableLds_auto = LWaves == 1 ? false : true
static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true
static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true
static constexpr auto AEnableLds_manu = false
static constexpr auto B0EnableLds_manu = true
static constexpr auto B1EnableLds_manu = true
static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1)
static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1)
static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1)
Static Public Attributes inherited from ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc1BiasDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, MaskingSpec >
static constexpr index_t NumAcc0Bias
static constexpr index_t NumAcc1Bias

Member Typedef Documentation

◆ AGridDesc

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc = decltype(MakeAGridDescriptor({}, {}))

◆ AGridDesc_G_M_K

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {}))

◆ B0GridDesc

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc = decltype(MakeB0GridDescriptor({}, {}))

◆ B0GridDesc_G_L_K

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {}))

◆ B1GridDesc

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc = decltype(MakeB1GridDescriptor({}, {}))

◆ B1GridDesc_G_N_L

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {}))

◆ C0MatrixMask

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())>

◆ CGridDesc_G_M_N

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {}))

◆ CGridDesc_M_N

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {}))

◆ DeviceOp

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::DeviceOp = DeviceGroupedQueryAttentionForward_Wmma

◆ GridwiseOp

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::GridwiseOp

◆ Transform

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
using ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::Transform

Member Function Documentation

◆ GetTypeString()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
std::string ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::GetTypeString ( ) const
inlineoverridevirtual

◆ IsSupportedArgument() [1/2]

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
bool ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::IsSupportedArgument ( const BaseArgument * p_arg)
inlineoverridevirtual

◆ IsSupportedArgument() [2/2]

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
bool ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::IsSupportedArgument ( const RawArg & arg)
inlinestatic

◆ IsValidCompilationParameter()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
constexpr bool ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::IsValidCompilationParameter ( )
inlinestaticconstexpr

◆ make_MaskOutPredicate()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
__host__ __device__ static constexpr auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::make_MaskOutPredicate ( )
inlinestaticconstexpr

◆ MakeAGridDescriptor()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
__host__ static __device__ auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeAGridDescriptor ( const std::array< index_t, NumDimG+NumDimM+NumDimN > & a_gs_ms_ks_lengths_vec,
const std::array< index_t, NumDimG+NumDimM+NumDimN > & a_gs_ms_ks_strides_vec )
inlinestatic

◆ MakeArgument()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeArgument ( const ADataType * p_a,
const B0DataType * p_b0,
const B1DataType * p_b1,
CDataType * p_c,
index_t M,
index_t N,
index_t K,
index_t O,
index_t G0,
index_t G1,
float alpha,
bool input_permute,
bool output_permute )
inlinestatic

◆ MakeArgumentPointer()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
std::unique_ptr< BaseArgument > ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeArgumentPointer ( const void * p_a,
const void * p_b0,
const void * p_b1,
void * p_c,
const std::array< void *, NumAcc0Bias > p_acc0_biases,
const std::array< void *, NumAcc1Bias > p_acc1_biases,
const std::vector< index_t > & a_gs_ms_ks_lengths,
const std::vector< index_t > & a_gs_ms_ks_strides,
const std::vector< index_t > & b0_gs_ls_ks_lengths,
const std::vector< index_t > & b0_gs_ls_ks_strides,
const std::vector< index_t > & b1_gs_ns_ls_lengths,
const std::vector< index_t > & b1_gs_ns_ls_strides,
const std::vector< index_t > & c_gs_ms_ns_lengths,
const std::vector< index_t > & c_gs_ms_ns_strides,
const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_lengths,
const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_strides,
const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_lengths,
const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_strides,
AElementwiseOperation a_element_op,
B0ElementwiseOperation b0_element_op,
AccElementwiseOperation acc_element_op,
B1ElementwiseOperation b1_element_op,
CElementwiseOperation c_element_op )
inlineoverridevirtual

◆ MakeB0GridDescriptor()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
__host__ static __device__ auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeB0GridDescriptor ( const std::array< index_t, NumDimG+NumDimM+NumDimN > & b0_gs_ls_ks_lengths_vec,
const std::array< index_t, NumDimG+NumDimM+NumDimN > & b0_gs_ls_ks_strides_vec )
inlinestatic

◆ MakeB1GridDescriptor()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
__host__ static __device__ auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeB1GridDescriptor ( const std::array< index_t, NumDimG+NumDimM+NumDimN > & b1_gs_ns_ls_lengths_vec,
const std::array< index_t, NumDimG+NumDimM+NumDimN > & b1_gs_ns_ls_strides_vec )
inlinestatic

◆ MakeInvoker()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeInvoker ( )
inlinestatic

◆ MakeInvokerPointer()

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
std::unique_ptr< BaseInvoker > ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MakeInvokerPointer ( )
inlineoverridevirtual

Member Data Documentation

◆ AEnableLds

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1)
staticconstexpr

◆ AEnableLds_auto

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AEnableLds_auto = LWaves == 1 ? false : true
staticconstexpr

◆ AEnableLds_manu

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AEnableLds_manu = false
staticconstexpr

◆ B0EnableLds

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1)
staticconstexpr

◆ B0EnableLds_auto

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0EnableLds_auto = MWaves == 1 ? false : true
staticconstexpr

◆ B0EnableLds_manu

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0EnableLds_manu = true
staticconstexpr

◆ B1EnableLds

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1)
staticconstexpr

◆ B1EnableLds_auto

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1EnableLds_auto = MWaves == 1 ? false : true
staticconstexpr

◆ B1EnableLds_manu

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1EnableLds_manu = true
staticconstexpr

◆ I0

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I0 = Number<0>{}
staticconstexpr

◆ I1

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I1 = Number<1>{}
staticconstexpr

◆ I2

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I2 = Number<2>{}
staticconstexpr

◆ I3

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I3 = Number<3>{}
staticconstexpr

◆ I4

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I4 = Number<4>{}
staticconstexpr

◆ I5

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I5 = Number<5>{}
staticconstexpr

◆ I6

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::I6 = Number<6>{}
staticconstexpr

◆ LWaves

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::LWaves = LPerBlock / (LRepeat * LPerWmma)
staticconstexpr

◆ MWaves

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::MWaves = MPerBlock / (MRepeat * MPerWmma)
staticconstexpr

◆ NumAcc0Bias

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumAcc0Bias = Acc0BiasDataType::Size()
staticconstexpr

◆ NumAcc1Bias

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumAcc1Bias = Acc1BiasDataType::Size()
staticconstexpr

◆ NumDimGemm0K

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm0K = NumDimK
staticconstexpr

◆ NumDimGemm0M

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm0M = NumDimM
staticconstexpr

◆ NumDimGemm0N

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm0N = NumDimL
staticconstexpr

◆ NumDimGemm1K

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm1K = NumDimL
staticconstexpr

◆ NumDimGemm1M

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm1M = NumDimM
staticconstexpr

◆ NumDimGemm1N

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
index_t ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NumDimGemm1N = NumDimN
staticconstexpr

◆ NWaves

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::NWaves = NPerBlock / (NRepeat * NPerWmma)
staticconstexpr

◆ WmmaK

template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t QueryGroupNumber, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
auto ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, QueryGroupNumber, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::WmmaK = 16
staticconstexpr

The documentation for this struct was generated from the following file: