ck_tile Namespace Reference

ck_tile Namespace Reference

Composable Kernel: ck_tile Namespace Reference
ck_tile Namespace Reference

Namespaces

namespace  impl
namespace  detail
namespace  details
namespace  util
namespace  internal
namespace  literals
namespace  ReduceOp
namespace  conv
namespace  ranges
namespace  tensor_layout
namespace  element_wise
namespace  moe
namespace  reboot

Classes

struct  base_transform
struct  pass_through
struct  pad
struct  left_pad
struct  right_pad
struct  embed
struct  lambda_merge_generate_MagicDivision_calculate_magic_divisor
struct  merge_v2_magic_division
struct  merge_v3_division_mod
struct  unmerge
struct  freeze
struct  insert
struct  replicate
struct  slice
struct  modulo
struct  xor_t
struct  offset
struct  indexing
struct  indexing_adaptor_onshot_cached
struct  space_filling_curve
struct  tile_distribution_encoding_pattern
struct  tile_distribution_encoding_pattern_2d
 Class creating a 2D static tile distribution with different load/store patterns. More...
struct  tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::thread_raked, NumWaveGroups >
struct  tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::warp_raked, NumWaveGroups >
struct  tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::block_raked, NumWaveGroups >
struct  buffer_resource
struct  buffer_load
struct  buffer_load_if
struct  buffer_store
struct  buffer_store_if
struct  buffer_load< 16, pre_nop >
struct  buffer_load< 8, pre_nop >
struct  buffer_load< 4, pre_nop >
struct  buffer_load< 2, pre_nop >
struct  buffer_load< 1, pre_nop >
struct  buffer_load_if< 16, pre_nop >
struct  buffer_load_if< 8, pre_nop >
struct  buffer_load_if< 4, pre_nop >
struct  buffer_load_if< 2, pre_nop >
struct  buffer_load_if< 1, pre_nop >
struct  buffer_store< 16 >
struct  buffer_store< 8 >
struct  buffer_store< 4 >
struct  buffer_store< 2 >
struct  buffer_store< 1 >
struct  buffer_store_if< 16 >
struct  buffer_store_if< 8 >
struct  buffer_store_if< 4 >
struct  buffer_store_if< 2 >
struct  buffer_store_if< 1 >
struct  buffer_atomic_add_if
struct  buffer_atomic_add_if< bf16_t, 2, pre_nop >
struct  buffer_atomic_add
struct  buffer_atomic_add< bf16_t, 2, pre_nop >
struct  smem_load
struct  smem_load< 16 >
struct  smem_load< 8 >
struct  smem_load< 4 >
struct  smem_load< 2 >
struct  smem_load< 1 >
struct  LaneGroupTransposeTraits
struct  LaneGroupTransposeTraits< T, LaneGroupSize, std::enable_if_t< sizeof(T)==2 > >
struct  LaneGroupTransposeTraits< T, LaneGroupSize, std::enable_if_t< sizeof(T)==1 > >
struct  safe_underlying_type
struct  safe_underlying_type< T, true >
struct  safe_underlying_type< T, false >
struct  WaitcntLayoutGfx12
struct  WaitcntLayoutGfx11
struct  WaitcntLayoutLegacy
struct  waitcnt_arg
struct  gfx9_t
struct  gfx950_t
struct  gfx103_t
struct  gfx11_t
struct  gfx12_t
struct  gfx_invalid_t
struct  workgroup_barrier
struct  array
 A fixed-size array container similar to std::array with additional utilities. More...
struct  array< T, 0 >
 Specialization of array container for zero elements. More...
struct  vector_traits
struct  vector_traits< array< T, N >, void >
struct  map
struct  meta_data_buffer
struct  sequence
struct  sequence_split
struct  sequence_reverse
struct  sequence_map_inverse
struct  is_valid_sequence_map
struct  sequence_merge
struct  sequence_merge< sequence< Xs... >, sequence< Ys... > >
struct  sequence_merge< Seq >
struct  sequence_gen
struct  arithmetic_sequence_gen
struct  arithmetic_sequence_gen< 0, IEnd, 1 >
struct  uniform_sequence_gen
struct  sequence_reverse_inclusive_scan
struct  sequence_reverse_inclusive_scan< sequence< I, Is... >, Reduce, Init >
struct  sequence_reverse_inclusive_scan< sequence< I >, Reduce, Init >
struct  sequence_reverse_inclusive_scan< sequence<>, Reduce, Init >
struct  sequence_reverse< sequence< Ns... > >
struct  sequence_reduce
struct  sequence_reduce< Reduce, sequence< Xs... >, sequence< Ys... > >
struct  sequence_reduce< Reduce, Seq >
struct  sequence_sort_impl
struct  sequence_sort_impl< sequence< ValueX, ValueY >, sequence< IdX, IdY >, Compare >
struct  sequence_sort_impl< sequence< Value >, sequence< Id >, Compare >
struct  sequence_sort_impl< sequence<>, sequence<>, Compare >
struct  sequence_sort
struct  sequence_unique_sort
struct  sequence_exclusive_scan
struct  sequence_exclusive_scan< sequence< Xs... >, sequence< Y, Ys... >, Reduce >
struct  sequence_exclusive_scan< sequence< Xs... >, sequence< Y >, Reduce >
struct  sequence_exclusive_scan< sequence< Xs... >, sequence<>, Reduce >
struct  tuple
class  span
struct  vector_traits< tuple< T... >, void >
struct  tuple_concat
struct  tuple_concat< tuple< Xs... >, tuple< Ys... > >
struct  numeric
struct  numeric< bfloat16_t >
struct  numeric_traits< bfloat16_t >
struct  e8m0_bexp_t
 Unsigned representation of a conventional biased Float32 exponent. More...
struct  numeric_traits< e8m0_t >
struct  numeric< e8m0_t >
struct  numeric_traits< fp8_t >
struct  numeric_traits< bf8_t >
struct  numeric< fp8_t >
struct  numeric< bf8_t >
struct  numeric< half_t >
struct  numeric_traits< half_t >
struct  numeric< int8_t >
struct  constant
struct  integral_constant
struct  is_constant
struct  is_constant< constant< v > >
struct  scales_c
struct  scales
struct  plus
struct  plus< void, void >
struct  minus
struct  minus< void, void >
struct  multiplies
struct  multiplies< void, void >
struct  maximize
struct  minimize
struct  integer_divide_ceiler
struct  equal
struct  equal< void, void >
struct  equal< float, float >
struct  equal< double, double >
struct  less
struct  less< void, void >
struct  less_equal
struct  less_equal< void, void >
struct  less_equal< float, float >
struct  less_equal< double, double >
struct  log2e
struct  log2e< double >
struct  log2e< float >
struct  numeric_utils
struct  null_type
struct  numeric_traits
struct  numeric_traits< float >
struct  pk_float4_e2m1_t
struct  numeric_traits< pk_fp4_t >
struct  numeric< pk_fp4_t >
struct  pk_int4_t
struct  numeric< pk_int4_t >
struct  numeric_traits< pk_int4_t >
struct  native_t
struct  vector_traits< T, void >
struct  buffer_view
struct  buffer_view< address_space_enum::generic, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
struct  buffer_view< address_space_enum::global, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence >
struct  buffer_view< address_space_enum::lds, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
struct  buffer_view< address_space_enum::vgpr, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
struct  DefaultTranspose
struct  TransposeTileDistrChecker
struct  TransposeTileDistributionTraits
struct  null_tensor
struct  null_tile_window
struct  static_distributed_tensor
struct  tile_sweeper
struct  tensor_adaptor
struct  lambda_get_up_dim_num
struct  tensor_adaptor_coordinate
struct  tensor_coordinate
struct  tensor_descriptor
struct  tensor_view
struct  null_tensor_view
struct  tile_distributed_span
struct  tile_distributed_index
struct  tile_distribution
struct  tile_distribution_encoding
class  tile_distribution_encoding_shuffle
class  tile_distribution_encoding_shuffle< encoding, sequence< shuffle... > >
struct  tile_scatter_gather
 This class provides a tile (windowed) view of, and access to, the device memory. More...
struct  tile_window_with_static_distribution
 This class provides a tile (windowed) view of, and access to, the device memory. More...
struct  tile_window_with_static_lengths
 This class provides a description of a tile-windowed view on the device memory. More...
struct  is_tile_window_with_static_distribution
 Type trait to determine if a type is a tile window with static distribution. More...
struct  is_tile_window_with_static_distribution< tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > >
 Specialization for tile_window_with_static_distribution to evaluate to true_type. More...
struct  is_tile_window_with_static_lengths
 Type trait to determine if a type is a tile window with static lengths. More...
struct  is_tile_window_with_static_lengths< tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > >
 Specialization for tile_window_with_static_lengths to evaluate to true_type. More...
struct  tile_window_base
 This class provides a description of a tile-windowed view on the device memory. More...
struct  tile_window_with_tile_dstr_base
struct  tile_window_linear
struct  is_tile_window_linear
 Type trait to determine if a type is a linear tile window. More...
struct  is_tile_window_linear< tile_window_linear< BottomTensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > >
 Specialization of is_tile_window_linear for tile_window_linear. More...
struct  str_literal
struct  thread_buffer
struct  CK_PRINTF
struct  CK_PRINTF< ConvertTo, str_literal< FMTChars... >, str_literal< PREFIXChars... >, str_literal< SUFFIXChars... > >
struct  CK_PRINTF_WARP0
struct  AsmScopeMarker
struct  static_for
struct  static_for< 0, N, 1 >
struct  identity
struct  static_ford
struct  static_uford
struct  magic_division32_bit_range
struct  magic_division16_bit_range
struct  mdiv
struct  mdiv2
class  philox
struct  prand_generator_t
struct  prand_generator_t< float, seed_ >
struct  prand_generator_t< half_t, seed_ >
struct  AccumulateWithIndex
 Accumulate with index-tracking reductions; provides a deterministic first-occurring index. More...
struct  Accumulate
struct  static_counter
struct  transpose_vectors
struct  copy_const
struct  copy_const< const From, To >
struct  nonesuch
struct  is_any_of
struct  is_any_of< CompareTo, FirstType >
struct  is_any_of< CompareTo, FirstType, Rest... >
struct  is_specialization_of
struct  is_specialization_of< RefTemplate< Args... >, RefTemplate >
struct  tuple_element_or_default
struct  composes
struct  composes< F >
struct  saturates
class  ArgParser
struct  IsCharArray
struct  IsCharArray< char[N]>
struct  IsCharArray< const char[N]>
struct  IsCharArray< char(&)[N]>
struct  IsCharArray< const char(&)[N]>
struct  DeviceMem
 Manages device memory allocation and host-device data transfers. More...
struct  FillUniformDistribution
struct  FillUniformDistribution< ck_tile::pk_int4_t >
struct  FillUniformDistribution_Unique
struct  FillNormalDistribution
struct  FillUniformDistributionIntegerValue
struct  FillNormalDistributionIntegerValue
struct  FillMonotonicSeq
struct  FillStepRange
struct  FillConstant
struct  AdjustToStructuredSparsity
 Transforms given input to fit 2:4 structured sparsity pattern so every subgroup of 4 elements contains at most 2 non-zero elements. More...
struct  FillTrigValue
struct  HostTensorDescriptor
 Descriptor for tensors in host memory. More...
struct  ParallelTensorFunctor
struct  HostTensor
struct  joinable_thread
struct  reference_layernorm2d_default_epilogue
struct  reference_rmsnorm2d_default_epilogue
struct  RotatingMemWrapper
struct  stream_config
struct  gpu_timer
struct  cpu_timer
struct  AddRmsnorm2dRdquantFwdHostArgs
struct  AddRmsnorm2dRdquantFwd
struct  AddRmsnorm2dRdquantFwdPipelineDefaultPolicy
struct  AddRmsnorm2dRdquantFwdPipelineOnePass
struct  AddRmsnorm2dRdquantFwdPipelineProblem
struct  AddRmsnorm2dRdquantFwdPipelineThreePass
struct  BatchedContractionProblem
struct  TensorDescriptorUtils
 Utility class for creating tensor descriptors in batched contraction operations. More...
struct  BatchedTransposeHostArgs
struct  BatchedTransposeKernel
struct  BatchedTransposeCommonPolicy
struct  BatchedTransposeLdsPipeline
struct  BatchedTransposeLdsPolicy
struct  BatchedTransposeLdsProblem
struct  BatchedTransposePipeline
struct  BatchedTransposePolicy
struct  BatchedTransposeProblem
struct  Generic2dBlockShape
struct  is_pk_int4
struct  is_pk_int4< pk_int4_t >
struct  InterleavedPKTypeLoader
struct  typeToStr
struct  typeToStr< float >
struct  typeToStr< fp16_t >
struct  typeToStr< bf16_t >
struct  typeToStr< fp8_t >
struct  typeToStr< bf8_t >
struct  typeToStr< int8_t >
struct  typeToStr< pk_int4_t >
struct  memOpToStr
struct  memOpToStr< memory_operation_enum::set >
struct  memOpToStr< memory_operation_enum::atomic_add >
struct  memOpToStr< memory_operation_enum::atomic_max >
struct  memOpToStr< memory_operation_enum::add >
struct  ElementWiseKernel
struct  ElementWiseDefaultPolicy
struct  ElementWisePipelineProblem
struct  ElementWiseShape
struct  CShuffleEpilogueProblem
struct  CShuffleEpilogue
struct  Default2DAndDynamicQuantEpilogueProblem
struct  Default2DAndDynamicQuantEpilogue
struct  Default2DEpilogueProblem
struct  DefaultGemm2DEpilogueProblem
struct  Default2DEpilogue
struct  DefaultGemm2DEpilogue
struct  DynamicQuantEpilogueTraits
struct  DynamicQuantEpilogueProblem
struct  DynamicQuantEpilogue
struct  BlockFlatmmASmemBSmemCRegV1
struct  BlockFlatmmASmemBSmemCRegV1CustomPolicy
struct  Flatmm_32x512x128_1x4x1_16x16x32_Base
struct  Flatmm_32x512x128_1x4x1_16x16x32_BF16
struct  Flatmm_32x512x128_1x4x1_16x16x32_FP16
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_Base
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_BF16
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_FP16
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl
struct  FlatmmProblem
struct  FlatmmScalePointer
struct  FlatmmScalePointer< SharedGranularityMN, 0 >
struct  FlatmmScalePointer<-1, 0 >
struct  BaseFlatmmHostArgs
struct  ScaleFlatmmHostArgs
struct  FlatmmKernelArgs
struct  FlatmmKernel
struct  GroupedFlatmmHostArgs
struct  ContiguousGroupedFlatmmHostArgs
struct  MaskedGroupedFlatmmHostArgs
struct  GroupedFlatmmKernel
struct  F16xMXF4FlatmmKernel
struct  MoeFlatmmHostArgs
struct  MoeFlatmmKernel
struct  MXFlatmmKernel
struct  BaseFlatmmPipelineAGmemBGmemCRegV1
struct  FlatmmPipelineAGmemBGmemCRegV1
struct  UniversalFlatmmPipelineAgBgCrPolicy
struct  F16xMXF4FlatmmPipelineProblem
struct  F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
struct  F16xMXF4FlatmmPipelineAgBgCrPolicy
struct  MoeFlatmmPipelineAGmemBGmemCRegV1
struct  MXFlatmmPipelineProblem
struct  MXF4FlatmmPipelineAGmemBGmemCRegV1
struct  MXF4FlatmmPipelineAgBgCrPolicy
struct  TileFlatmmShape
struct  BlockAttentionBiasEnumToStr
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::NO_BIAS >
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::ELEMENTWISE_BIAS >
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::ALIBI >
struct  NullBlockDropout
struct  BlockDropout
struct  BlockDropoutBwd
struct  BlockDropoutBwd< false, IsWG32_, IsStoreRandval_ >
struct  BlockDropoutBwd< true, IsWG32_, IsStoreRandval_ >
struct  GenericAttentionMask
struct  SimplifiedGenericAttentionMask
struct  SimplifiedRatioAttentionMask
struct  Alibi
struct  EmptyPositionEncoding
struct  RotaryEmbeddingEnumToStr
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::NONE >
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::INTERLEAVED >
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::HALF_ROTATED >
struct  BlockRotaryEmbedding
struct  TrivialPageBlockNavigator
struct  PageBlockNavigator
struct  StandardAttentionParams
struct  LogitsSoftCapParams
struct  StandardAttention
struct  LogitsSoftCap
struct  ComposedAttention
struct  FmhaBatchPrefillWithPagedKVCacheKernel
struct  FmhaBwdDQDKDVKernel
struct  FmhaBwdOGradDotOKernel
struct  FmhaBwdConvertQGradKernel
struct  FmhaFwdAppendKVKernel
struct  FmhaFwdAppendKVTilePartitioner
struct  FmhaFwdKernel
struct  FmhaFwdPagedKVKernel
struct  FmhaFwdSplitKVCombineKernel
struct  FmhaFwdSplitKVKernel
struct  FmhaFwdV3Kernel
struct  BlockFmhaBatchPrefillPipelineQRKSVSAsync
struct  BlockFmhaBwdConvertQGrad
struct  BlockFmhaBwdOGradDotO
struct  BlockFmhaBwdDQDKDVPipelineKRKTRVR
struct  BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
class  BlockFmhaBwdDQDKDVPipelineSelector
class  BlockFmhaBwdDQDKDVPipeline
struct  BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
struct  BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
struct  fmha_bwd_qr_qtr_dor_pipeline
struct  fmha_bwd_qr_qtr_dor_pipeline< T, std::void_t< decltype(T::is_qr_qtr_dor_pipeline)> >
struct  BlockFmhaBwdPipelineDefaultPolicy
struct  BlockFmhaBwdPipelineProblem
struct  BlockFmhaBwdOGradDotOPipelineProblem
struct  BlockFmhaBwdConvertQGradPipelineProblem
struct  BlockFmhaBwdPipelineTrLoadDefaultPolicy
struct  BlockFmhaFwdAppendKVPipeline
struct  BlockFmhaFwdAppendKVPipelineDefaultPolicy
struct  BlockFmhaFwdPagedKVPipelineQRKSVS
struct  BlockFmhaFwdPagedKVPipelineQRKSVSDefaultPolicy
struct  BlockFmhaFwdSplitKVCombinePipeline
struct  BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
struct  BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
struct  BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
struct  BlockFmhaFwdSplitKVPipelineQRKSVS
struct  BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy
struct  CoreLoopScheduler
struct  CoreLoopScheduler< PipelineProblem, true >
struct  CoreLoopScheduler< PipelineProblem, false >
struct  BlockFmhaFwdV3Pipeline
struct  BlockFmhaV3PipelineDefaultPolicy
struct  BlockFmhaPipelineEnumToStr
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS >
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS_ASYNC >
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QSKSVS >
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD >
struct  BlockFmhaPipelineProblem
struct  BlockFmhaFwdPagedKVPipelineProblem
struct  BlockFmhaFwdSplitKVPipelineProblem
struct  BlockFmhaSplitKVCombinePipelineTileSizes
struct  BlockFmhaSplitKVCombinePipelineProblem
struct  BlockFmhaFwdAppendKVPipelineProblem
struct  BlockFmhaFwdV3PipelineProblem
struct  BlockFmhaPipelineQRKSVS
struct  BlockFmhaPipelineQRKSVSAsync
struct  BlockFmhaPipelineQRKSVSAsyncTrload
struct  BlockFmhaPipelineQRKSVSAsyncTrloadDefaultPolicy
struct  BlockFmhaPipelineQRKSVSFp8
struct  BlockFmhaPipelineQRKSVSWholeKPrefetch
struct  BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy
struct  BlockFmhaPipelineQSKSVS
struct  BlockFmhaPipelineQSKSVSDefaultPolicy
struct  BlockFmhaPipelineQXCustomPolicy
struct  BlockFmhaPipelineQXCustomPolicy< true >
struct  BlockFmhaPipelineQXCustomPolicy< false >
struct  BlockFmhaPipelineQXKSVSCustomPolicy
struct  TileFmhaShape
struct  TileFmhaBwdShape
struct  TileFmhaTraits
struct  TileFmhaBwdTraits
struct  TileFmhaFwdPagedKVTraits
struct  TileFmhaFwdSplitKVTraits
struct  TileFmhaFwdSplitKVCombineTraits
struct  TileFmhaFwdAppendKVTraits
struct  TileFmhaBwdOGradDotOTraits
struct  TileFmhaBwdConvertQGradTraits
struct  TileFmhaFwdV3Traits
struct  FusedMoeGemmHostArgs
struct  FusedMoeGemmKernel
struct  FusedMoeGemmShape
struct  FusedMoeGemmTilePartitioner_Linear
struct  MoeSortingHostArgs
struct  MoeSortingKernel
struct  MoeSortingClearWorkspaceKernel
struct  MoeSortingMultiPhaseKernel_P0_v1
struct  MoeSortingMultiPhaseKernel_P0_v2
struct  MoeSortingMultiPhaseKernel_P1
struct  MoeSortingMultiPhaseKernel_P2
struct  MoeSortingMultiPhaseKernel_P3
struct  MoeSortingMultiPhaseKernel_P23
struct  MoeSortingProblem
struct  MoeSortingProblemEx
struct  MoeSortingProblemMp
struct  MoeSortingClearWorkspaceProblem
struct  FusedMoeGemmPipeline_FlatmmEx
struct  FusedMoeGemmPipelineFlatmmPolicy
struct  FusedMoeGemmPipeline_FlatmmUk
struct  FusedMoeGemmPipelineProblem
struct  FusedMoeGemmTraits
struct  MoeSortingPolicy
struct  BlockGemmARegBGmemCRegV1
struct  BlockGemmARegBGmemCRegV1DefaultPolicy
struct  BlockGemmARegBRegCRegV1
struct  BlockGemmARegBRegCRegV1CustomPolicy
struct  BlockGemmARegBRegCRegV1DefaultPolicy
struct  BlockGemmARegBRegCRegV2
struct  BlockGemmARegBRegCRegV2CustomPolicy
struct  BlockGemmARegBSmemCRegOneWarpV1
struct  BlockGemmARegBSmemCRegV1
struct  BlockGemmARegBSmemCRegV1CustomPolicy
struct  BlockGemmARegBSmemCRegV1DefaultPolicy
struct  BlockGemmARegBSmemCRegV2
struct  BlockGemmARegBSmemCRegV2CustomPolicy
struct  BlockGemmARegBSmemCRegV2DefaultPolicy
struct  BlockGemmARegBSmemCRegV2R1
struct  BlockGemmASmemBRegCRegV1
struct  BlockGemmASmemBRegCRegV1CustomPolicy
struct  BlockGemmASmemBRegCRegV1DefaultPolicy
struct  BlockGemmASmemBSmemCRegV1
struct  BlockGemmASmemBSmemCRegV1CustomPolicy
struct  BlockGemmASmemBSmemCRegV1DefaultPolicy
struct  BlockGemmProblem
struct  BlockUniversalGemmAsBsCr
struct  BlockWeightPreshuffleASmemBSmemCRegV1
struct  BlockWeightPreshuffleASmemBSmemCRegV1CustomPolicy
struct  BatchedGemmHostArgs
 The Batched GEMM kernel host arguments. More...
struct  BatchedGemmKernel
struct  GemmHostArgs
 The GEMM kernel host arguments. More...
struct  GemmKernel
struct  GemmMultiABDHostArgs
 The MultiABD GEMM kernel host arguments. More...
struct  GemmKernelMultiABD
struct  GemmMultiDHostArgs
 The MultiD GEMM kernel host arguments. More...
struct  GemmKernelMultiD
struct  GemmTile2DPartitioner
 Class providing 2D workgroup index mapping into 2D output GEMM C-tile space. More...
struct  GemmTile1DPartitioner
 Class providing 1D WGP index mapping into 2D output C-tile space. More...
struct  HasFnOneArgImpl
 Fallback (std::false_type) case of the in-place validity check for GemmTile1DPartitioner::GetOutputTileIndex, selected when the one-argument call expression is ill-formed. More...
struct  HasFnOneArgImpl< T, std::void_t< decltype(std::declval< T >().GetOutputTileIndex(1))> >
 Specialization (std::true_type) of the in-place validity check for GemmTile1DPartitioner::GetOutputTileIndex, selected when the one-argument call expression is well-formed. More...
struct  OffsettedTile1DPartitioner
 Struct used to calculate offset tile indexes. More...
struct  GemmSpatiallyLocalTilePartitioner
 Class mapping 1D block index into 2D output tile space. More...
struct  StreamKTilePartitioner
 Stream-K tile partitioner that dynamically balances work across workgroups. More...
struct  GroupedGemmHostArgs
 The Grouped GEMM kernel host arguments. More...
struct  GemmTransKernelArg
struct  GroupedGemmKernel
struct  StreamKHostArgs
 The Stream K GEMM kernel host arguments. More...
struct  StreamKKernel
struct  StreamKTilePartitionerBase
 Stream-K tile partitioner base class. More...
struct  StreamKTilePartitioner_v2
 Template for the Stream-K tile partitioner derived struct. More...
struct  StreamKTilePartitioner_v2< BlockGemmShapeType, ReductionStrategyType, true >
 Persistent Stream-K tile partitioner derived struct. More...
struct  StreamKTilePartitioner_v2< BlockGemmShapeType, ReductionStrategyType, false >
 Non-Persistent Stream-K tile partitioner derived struct. More...
struct  UniversalGemmHostArgs
 The Universal GEMM kernel host arguments. More...
struct  UniversalGemmKernelArgs
 The GEMM kernel device arguments. More...
struct  UniversalGemmKernel
 The Universal GEMM kernel template. More...
struct  GemmPipelineAgBgCrImplBase
struct  BaseGemmPipelineAgBgCrCompAsync
struct  GemmPipelineAgBgCrCompAsync
 Compute-optimized asynchronous pipeline version, which is based on V4. More...
struct  GemmPipelineAgBgCrCompAsyncDefaultPolicy
struct  BaseGemmPipelineAgBgCrCompV3
struct  GemmPipelineAgBgCrCompV3
struct  BaseGemmPipelineAgBgCrCompV4
struct  GemmPipelineAgBgCrCompV4
 Compute-optimized pipeline version 4. More...
struct  GemmPipelineAgBgCrCompV4DefaultPolicy
struct  BaseGemmPipelineAgBgCrCompV5
struct  GemmPipelineAgBgCrCompV5
struct  GemmPipelineAgBgCrCompV5DefaultPolicy
struct  BaseGemmPipelineAgBgCrCompV6
struct  GemmPipelineAgBgCrCompV6
struct  GemmPipelineAgBgCrCompV6DefaultPolicy
struct  BaseGemmPipelineAgBgCrMem
struct  GemmPipelineAgBgCrMem
struct  GemmPipelineAGmemBGmemCRegV1
struct  GemmPipelineAGmemBGmemCRegV1DefaultPolicy
struct  GemmPipelineAGmemBGmemCRegV2
struct  GemmPipelineProblemBase
struct  UniversalGemmPipelineProblem
struct  FlatmmPipelineProblem
struct  has_a_tile_access_pattern
struct  has_a_tile_access_pattern< T, std::void_t< decltype(T::ATileAccessPattern)> >
struct  has_b_tile_access_pattern
struct  has_b_tile_access_pattern< T, std::void_t< decltype(T::BTileAccessPattern)> >
struct  UniversalGemmBasePolicy
struct  UniversalGemmPipelineAgBgCrPolicy
struct  TileGemmShape
struct  TileGemmTraits
struct  TileGemmUniversalTraits
struct  UniversalWeightPreshufflePipelineAgBgCrPolicy
struct  BaseWeightPreshufflePipelineAGmemBGmemCRegV2
struct  WeightPreshufflePipelineAGmemBGmemCRegV2
struct  WarpGemmAttributeMfma
struct  WarpGemmAttributeMfmaIterateK
struct  WarpGemmAttributeMfmaTransposedCDistribution
struct  WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB
struct  WarpGemmAttributeMfmaIterateKAndTransposedCDistribution
struct  WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB
struct  WarpGemmAttributeMfmaIterateK_SwizzleA
struct  WarpGemmAttributeMfmaImplF32F32F32M16N16K4
struct  WarpGemmAttributeMfmaImplF32F32F32M32N32K2
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32
struct  WarpGemmAttributeMfmaImplF16F16F32M32N32K8
struct  WarpGemmAttributeMfmaImplF16F16F32M16N16K16
struct  WarpGemmAttributeMfmaImplF16F16F32M16N16K32
struct  WarpGemmAttributeMfmaImplF16F16F32M4N64K4
struct  WarpGemmAttributeMfmaImplF16F16F32M64N4K4
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
struct  WarpGemmAttributeMfmaImplF16F16F32M32N32K16
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16
struct  WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base
struct  WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
struct  WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base
struct  WarpGemmAttributeMfmaScaleImpl_f32_16x16x128_fp4
struct  WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base
struct  WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
struct  WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
struct  WarpGemmAttributeMfmaImpl_i32_16x16x64_i8
struct  WarpGemmAttributeMfmaImpl_i32_32x32x32_i8
struct  WarpGemmAttributeSmfmac
 Class describing structured sparsity mfma instructions. More...
struct  WarpGemmAttributeSmfmacImplF16F16F32M32N32K16
struct  WarpGemmAttributeSmfmacImplF16F16F32M16N16K32
struct  AWarpDstrEncodingTrait
struct  BWarpDstrEncodingTrait
struct  CWarpDstrEncodingTrait
struct  CTransposedWarpDstrEncodingTrait
struct  WarpGemmAttributeWmma
struct  WmmaTraits
struct  WarpGemmAttributeWmmaImpl
struct  has_wmma_traits
struct  WmmaTraits< gfx11_t, fp16_t, fp16_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx11_t, bf16_t, bf16_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, fp16_t, fp16_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, bf16_t, bf16_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx11_t, int8_t, int8_t, int32_t, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, int8_t, int8_t, int32_t, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, fp8_t, fp8_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, bf8_t, bf8_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, fp8_t, bf8_t, float, 16, 16, 16 >
struct  WmmaTraits< gfx12_t, bf8_t, fp8_t, float, 16, 16, 16 >
struct  WmmaTraitsBase
struct  WmmaTraitsBase< gfx11_t, ADType, BDType, CDType >
struct  WmmaTraitsBase< gfx12_t, ADType, BDType, CDType >
struct  WarpGemmImpl
struct  WarpGemmSmfmacImpl
struct  BlockGemmWeightPreshuffleBQuantARegBRegCReg
struct  BlockGemmAQuantBase
struct  AQuantBlockUniversalGemmAsBsCr
struct  BlockGemmBQuantBase
struct  BQuantBlockUniversalGemmAsBsCr
struct  QuantGemmProblem
struct  QuantGemmHostArgs
struct  QuantGemmKernelArgs
struct  QuantGemmKernel
struct  QuantGroupedGemmHostArgs
 The Grouped GEMM kernel host arguments. More...
struct  QuantGemmTransKernelArg
struct  QuantGroupedGemmKernel
struct  GemmAQuantPipelineAgBgCrImplBase
struct  BaseAQuantGemmPipelineAgBgCrMem
struct  AQuantGemmPipelineAgBgCrMem
struct  GemmAQuantPipelineAgBgCrDefaultPolicy
struct  BaseAQuantGemmPipelineAgBgCrCompV3
struct  AQuantGemmPipelineAgBgCrCompV3
struct  GemmBQuantPipelineAgBgCrImplBase
struct  GemmBQuantPipelineAgBgCrDefaultPolicy
struct  BaseBQuantGemmPipelineAgBgCrCompV3
struct  BQuantGemmPipelineAgBgCrCompV3
struct  tile_distribution_encoding_pattern_aq
struct  tile_distribution_encoding_pattern_aq_transposed_c
struct  tile_distribution_encoding_pattern_bq
struct  QuantGroupShape
struct  GemmQuantPipelineProblemBase
struct  GemmWPQuantPipelineAgBgCrPolicy
struct  WPQuantBPipelineAgBgCrV2
struct  TileGemmQuantTraits
struct  GroupedConvBwdDataKernelArgs
 The Grouped Convolution kernel device arguments. More...
struct  GroupedConvolutionBackwardDataKernel
 The Grouped Convolution Backward Data kernel template. More...
struct  GroupedConvBwdWeightKernelArgs
 The Grouped Convolution kernel device arguments. More...
struct  GroupedConvolutionBackwardWeightKernel
 The Grouped Convolution Backward Weight kernel template. More...
struct  GroupedConvFwdKernelArgs
 The Grouped Convolution kernel device arguments. More...
struct  GroupedConvolutionForwardKernel
 The Grouped Convolution Forward kernel template. More...
struct  GroupedConvHostArgs
 The Grouped Conv kernel host arguments. More...
struct  GroupedConvTraits
struct  SplitImagePieceInfo
 Helper struct for split-image piece information. More...
struct  TransformConvBwdDataToGemm
struct  TransformConvBwdWeightToGemm
struct  TransformConvFwdToGemm
struct  ImageToColumn
struct  BlockImageToColumnProblem
struct  TileImageToColumnShape
struct  Layernorm2dFwdHostArgs
struct  Layernorm2dFwd
struct  Layernorm2dFwdPipelineDefaultPolicy
struct  Layernorm2dFwdPipelineOnePass
struct  Layernorm2dFwdPipelineProblem
struct  Layernorm2dFwdPipelineTwoPass
struct  Layernorm2dXBiasEnumName
struct  Layernorm2dXBiasEnumName< Layernorm2dXBiasEnum::NO_BIAS >
struct  Layernorm2dXBiasEnumName< Layernorm2dXBiasEnum::ADD_BIAS >
struct  Layernorm2dFusedAddEnumName
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::NO_ADD >
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::PRE_ADD_STORE >
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::PRE_ADD >
struct  Layernorm2dFusedQuantEnumName
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::NO_SWEEP >
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::DYNAMIC_QUANT >
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT >
struct  Layernorm2dFwdTraits
struct  BlockNormReduce
struct  BlockNormReduceSync
struct  BlockNormReduceCrossWarpSync
struct  BlockNormReduceProblem
struct  GenericPermuteHostArgs
struct  GenericPermute
struct  GenericPermuteProblem
struct  PoolHostArgs
 Host arguments for pooling operations. More...
struct  PoolKernelArgs
 Kernel arguments for pooling operations. More...
struct  PoolKernel
struct  PoolDefaultPolicy
struct  PoolProblem
struct  PoolShape
struct  BlockReduce2D
struct  BlockReduce2d
struct  BlockReduce2dSync
struct  BlockReduce2dCrossWarpSync
struct  BlockReduce2dLinearCrossWarpSync
struct  BlockReduce2dProblem
struct  Reduce
struct  Reduce2dDefaultPolicy
struct  Reduce2dProblem
struct  Reduce2dShape
struct  Rmsnorm2dFwdHostArgs
struct  Rmsnorm2dFwd
struct  Rmsnorm2dFwdPipelineDefaultPolicy
struct  Rmsnorm2dFwdPipelineModelSensitiveT5Pass
 This T5Pass implements the RMSNorm2d forward pipeline as a variant based on Rmsnorm2dFwdPipelineOnePass and Rmsnorm2dFwdPipelineTwoPass using a T5 model-like method. More...
struct  Rmsnorm2dFwdPipelineOnePass
struct  Rmsnorm2dFwdPipelineProblem
struct  Rmsnorm2dFwdPipelineTwoPass
struct  Rmsnorm2dFusedAddEnumName
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::NO_ADD >
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::PRE_ADD_STORE >
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::PRE_ADD >
struct  Rmsnorm2dFusedQuantEnumName
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::NO_SWEEP >
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::DYNAMIC_QUANT >
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT >
struct  Rmsnorm2dSensitiveEnumName
struct  Rmsnorm2dSensitiveEnumName< Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL >
struct  Rmsnorm2dSensitiveEnumName< Rmsnorm2dSensitiveEnum::T5_MODEL_LIKE >
struct  Rmsnorm2dFwdTraits
struct  MoeSmoothquantHostArgs
struct  MoeSmoothquant
struct  SmoothquantHostArgs
struct  Smoothquant
struct  SmoothquantPipelineDefaultPolicy
struct  SmoothquantPipelineOnePass
struct  SmoothquantPipelineProblem
struct  SmoothquantPipelineTwoPass
struct  BlockSoftmax2D
struct  BlockSoftmax2DProblem
struct  BlockTopkStream2D
struct  BlockTopkStream2DProblem
struct  TopkSoftmaxHostArgs
struct  TopkSoftmaxKernel
struct  TopkSoftmaxWarpPerRowPipeline
struct  TopkSoftmaxWarpPerRowPolicy
struct  TopkSoftmaxWarpPerRowProblem
struct  naive_attention_fwd_args
struct  naive_attention_fwd_traits
struct  naive_attention_fwd_kernel_traits
struct  naive_attention_fwd_kernel

Typedefs

template<typename T>
using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type
using Waitcnt = WaitcntLayoutLegacy
template<index_t N>
using multi_index = array<index_t, N>
template<index_t N>
using make_index_sequence
template<typename... Seqs>
using sequence_merge_t = typename sequence_merge<Seqs...>::type
template<index_t NSize, index_t I>
using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type
template<typename T, index_t N>
using statically_indexed_array = tuple_array<T, N>
template<typename T, index_t N>
using thread_buffer = tuple_array<T, N>
template<typename T, index_t N>
using tuple_array = typename impl::tuple_array_impl<T, N>::type
template<typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple())
using bfloat16_t = ushort
using bf16_t = bfloat16_t
using bf16_raw_t = uint16_t
using e8m0_t = e8m0_bexp_t
using e8m0_raw_t = typename e8m0_t::raw_type
using fp8_t = _BitInt(8)
using fp8_raw_t = uint8_t
using bf8_t = unsigned _BitInt(8)
using bf8_raw_t = uint8_t
using fp16_hip_t = _Float16
using fp16_raw_t = uint16_t
using fp16_t = _Float16
using half_t = _Float16
using fp16x2_t = _Float16
using int8_t = int8_t
using index_t = int32_t
using int32_t = int32_t
using long_index_t = int64_t
template<index_t v>
using number = constant<v>
template<long_index_t v>
using long_number = constant<v>
template<bool b>
using bool_constant = constant<b>
using fp32_t = float
using fp32x2_t = float
using bf16x2_t = bfloat16_t
using pk_fp4_t = pk_float4_e2m1_t
using pk_fp4_raw_t = typename pk_fp4_t::type
using int8x2_t = int8_t
template<typename T, index_t N>
using ext_vector_t = typename impl::ext_vector<T, N>::type
template<typename X, typename Y>
using has_same_scalar_type
using fp64_t = double
using fp64x2_t = double
using fp64x4_t = double
using fp32x4_t = float
using fp32x8_t = float
using fp32x16_t = float
using fp32x32_t = float
using fp32x64_t = float
using fp16x4_t = _Float16
using fp16x8_t = _Float16
using fp16x16_t = _Float16
using fp16x32_t = _Float16
using fp16x64_t = _Float16
using bf16x4_t = bfloat16_t
using bf16x8_t = bfloat16_t
using bf16x16_t = bfloat16_t
using bf16x32_t = bfloat16_t
using bf16x64_t = bfloat16_t
using int32x2_t = int32_t
using int32x4_t = int32_t
using int32x8_t = int32_t
using int32x16_t = int32_t
using int32x32_t = int32_t
using int32x64_t = int32_t
using uint32x2_t = uint32_t
using uint32x4_t = uint32_t
using uint32x8_t = uint32_t
using uint32x16_t = uint32_t
using uint32x32_t = uint32_t
using uint32x64_t = uint32_t
using int16x2_t = int16_t
using int16x4_t = int16_t
using int16x8_t = int16_t
using int16x16_t = int16_t
using int16x32_t = int16_t
using int16x64_t = int16_t
using uint16x2_t = uint16_t
using uint16x4_t = uint16_t
using uint16x8_t = uint16_t
using uint16x16_t = uint16_t
using uint16x32_t = uint16_t
using uint16x64_t = uint16_t
using int8x4_t = int8_t
using int8x8_t = int8_t
using int8x16_t = int8_t
using int8x32_t = int8_t
using int8x64_t = int8_t
using uint8x2_t = uint8_t
using uint8x4_t = uint8_t
using uint8x8_t = uint8_t
using uint8x16_t = uint8_t
using uint8x32_t = uint8_t
using uint8x64_t = uint8_t
using fp8x2_t = fp8_t
using fp8x4_t = fp8_t
using fp8x8_t = fp8_t
using fp8x16_t = fp8_t
using fp8x32_t = fp8_t
using fp8x64_t = fp8_t
using bf8x2_t = bf8_t
using bf8x4_t = bf8_t
using bf8x8_t = bf8_t
using bf8x16_t = bf8_t
using bf8x32_t = bf8_t
using bf8x64_t = bf8_t
using pk_int4x2_t = int8_t
using pk_int4x4_t = int8_t
using pk_int4x8_t = int8_t
using pk_int4x16_t = int8_t
using pk_int4x32_t = int8_t
using pk_fp4x2_t = uint8_t __attribute((ext_vector_type(2)))
using pk_fp4x4_t = uint8_t __attribute((ext_vector_type(4)))
using pk_fp4x8_t = uint8_t __attribute((ext_vector_type(8)))
using pk_fp4x16_t = uint8_t __attribute((ext_vector_type(16)))
using pk_fp4x32_t = uint8_t __attribute((ext_vector_type(32)))
template<typename TileDistributionEncoding_, typename DataType_, typename Policy = DefaultTranspose<DataType_>>
using OutputTileDistributionTraits
template<typename TileDistributionEncoding_, typename DataType_, typename Policy = DefaultTranspose<DataType_>>
using InputTileDistributionTraits
template<typename encoding, typename shuffle>
using tile_distribution_encoding_shuffle_t
template<typename TensorView_>
using default_linear_bottom_dims
using magic_division = magic_division32_bit_range
template<typename T>
using remove_reference_t = typename std::remove_reference<T>::type
template<typename T>
using remove_cv_t = typename std::remove_cv<T>::type
template<typename T>
using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>
template<typename T>
using remove_pointer_t = typename std::remove_pointer<T>::type
template<typename From, typename To>
using copy_const_t = typename copy_const<From, To>::type
template<template< class... > class Op, class... Args>
using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t
template<typename T>
using is_static = impl::is_static_impl<remove_cvref_t<T>>
template<typename T>
using is_known_at_compile_time = is_static<T>
template<typename Tuple_, std::size_t Idx, typename DefaultType>
using tuple_element_or_default_t
using F8 = ck_tile::fp8_t
 8-bit floating point type
using BF8 = ck_tile::bf8_t
 8-bit brain floating point type
using F16 = ck_tile::half_t
 16-bit floating point (half precision) type
using BF16 = ck_tile::bf16_t
 16-bit brain floating point type
using F32 = float
 32-bit floating point (single precision) type
using I8 = int8_t
 8-bit signed integer type
using I32 = int32_t
 32-bit signed integer type
template<typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type
template<typename T>
using iter_reference_t = decltype(*std::declval<T&>())
template<typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type
template<bool kPadM_, bool kPadN_, bool UseSmoothInputScale_, bool UseRawStore_ = true, bool UseMax3_ = false>
using Default2DAndDynamicQuantEpilogueTraits
template<int NumberTensor = 0>
using FlatmmHostArgs
using BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy
using BlockFmhaPipelineQRKSVSAsyncDefaultPolicy
using BlockFmhaPipelineQRKSVSDefaultPolicy
using GemmPipelineAGmemBGmemCRegV2DefaultPolicy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy
template<typename AsDataType_, typename BsDataType_, typename EDataType_, typename BlockGemmShape_, typename Traits_, typename AElementWise_ = ck_tile::element_wise::PassThrough, typename BElementWise_ = ck_tile::element_wise::PassThrough, typename ComputeDataType_ = AsDataType_, bool FixedVectorSize_ = false, index_t VectorSizeA_ = 1, index_t VectorSizeB_ = 1>
using GemmPipelineProblem
template<bool kPadM_, bool kPadN_, bool kPadK_, bool DoubleSmemBuffer_, typename AsLayout_, typename BsLayout_, typename CLayout_, bool TransposeC_ = false, bool UseStructuredSparsity_ = false>
using PersistentTileGemmUniversalTraits
using WarpGemmMfmaF32F32F32M16N16K4
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF32F32F32M16N16K16
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution
using WarpGemmMfmaF16F16F32M32N32K8
using WarpGemmMfmaF16F16F32M16N16K16
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M32N32K16
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M16N16K32
using WarpGemmMfmaF16F16F32M32N32K8SwizzleA
using WarpGemmMfmaF16F16F32M32N32K16SwizzleA
using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution
using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution
using WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution
using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution
using WarpGemmMfmaF16F16F32M4N64K16
using WarpGemmMfmaF16F16F32M64N4K16
using WarpGemmSmfmacF16F16F32M32N32K16
using WarpGemmSmfmacF16F16F32M16N16K32
using WarpGemmMfmaBf16Bf16F32M32N32K8
using WarpGemmMfmaBf16Bf16F32M16N16K16
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M32N32K16
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M16N16K32
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA
using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution
using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution
using WarpGemmMfmaBf16Bf16F32M4N64K16
using WarpGemmMfmaBf16Bf16F32M64N4K16
using WarpGemmMfma_f32_32x32x16_fp8_fp8
using WarpGemmMfma_f32_32x32x16_fp8_bf8
using WarpGemmMfma_f32_16x16x32_fp8_bf8
using WarpGemmMfma_f32_32x32x16_bf8_fp8
using WarpGemmMfma_f32_32x32x16_bf8_bf8
using WarpGemmMfma_f32_32x32x32_fp8_fp8
using WarpGemmMfma_f32_32x32x32_bf8_bf8
using WarpGemmMfma_f32_32x32x32_fp8_bf8
using WarpGemmMfma_f32_16x16x32_fp8_fp8
using WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed
using WarpGemmMfma_f32_16x16x32_bf8_bf8
using WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed
using WarpGemmMfma_f32_16x16x64_fp8_fp8
using WarpGemmMfma_f32_16x16x64_bf8_bf8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp4
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_fp8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_bf8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_fp8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_bf8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_fp8_fp8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_fp8_bf8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_bf8_fp8
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_bf8_bf8
using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed
using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed
using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed
using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed
template<index_t swizzle_factor = 2>
using WarpGemmMfmaFp8Fp8F32M32N32K32SwizzleBTransposedCDistribution
using WarpGemmMfma_i32_32x32x16_i8_i8
using WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed
using WarpGemmMfma_i32_16x16x32_i8_i8
using WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8
using DeviceIp = remove_cvref_t<decltype(ck_tile::get_device_arch())>
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16
using WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8
template<typename AType, typename BType, typename AccType, index_t MPerWave, index_t NPerWave, index_t KPerWave, bool TransposeC, bool SwizzleA = false, bool UseStructuredSparsity = false, WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmDispatcher
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f16_f16
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf16_bf16
template<bool kTransC = false>
using WarpGemmWmma_i32_16x16x16_i8_i8
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f8_f8
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf8_bf8
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f8_bf8
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf8_f8
using QuantGroupedGemmKernelArgs = QuantGemmKernelArgs
template<typename ADataType_, typename AQDataType_, typename BDataType_, typename CDataType_, typename BlockGemmShape_, typename Traits_, typename QuantGroupSize_, bool TransposeC_, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using GemmAQuantPipelineProblem
template<typename ADataType_, typename BDataType_, typename BQDataType_, typename CDataType_, typename BlockGemmShape_, typename Traits_, typename QuantGroupSize_, typename ComputeDataType_ = ADataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using GemmBQuantPipelineProblem
template<typename ADataType_, typename BDataType_, typename CDataType_, typename AccDataType_, typename BlockGemmShape_, typename Traits_, bool TransposeC_ = false, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using GemmRowColTensorQuantPipelineProblem
using PassThrough = ck_tile::element_wise::PassThrough
template<typename CDElementwise = PassThrough>
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*, CDElementwise>
using GroupedConvBwdWeightHostArgs
using GroupedConvBwdDataHostArgs

Enumerations

enum struct  coord_transform_enum {
  undefined ,
  pass_through ,
  pad ,
  embed ,
  merge ,
  unmerge ,
  replicate ,
  xor_t ,
  offset ,
  indexing
}
enum struct  tile_distribution_pattern {
  thread_raked ,
  warp_raked ,
  block_raked
}
 Enumeration describing static tile distribution patterns. More...
enum struct  amd_buffer_coherence_enum {
  coherence_default = 0 ,
  glc = 1 ,
  slc = 2 ,
  glc_slc = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
enum struct  address_space_enum : std::uint16_t {
  generic = 0 ,
  global ,
  lds ,
  sgpr ,
  constant ,
  vgpr
}
enum struct  memory_operation_enum : std::uint16_t {
  set = 0 ,
  atomic_add ,
  atomic_max ,
  add
}
enum  LLVMSchedGroupMask : int32_t {
  NONE = 0 ,
  ALU = 1 << 0 ,
  VALU = 1 << 1 ,
  SALU = 1 << 2 ,
  MFMA = 1 << 3 ,
  VMEM = 1 << 4 ,
  VMEM_READ = 1 << 5 ,
  VMEM_WRITE = 1 << 6 ,
  DS = 1 << 7 ,
  DS_READ = 1 << 8 ,
  DS_WRITE = 1 << 9 ,
  ALL = (DS_WRITE << 1) - 1
}
enum class  bf16_rounding_mode {
  standard = 0 ,
  truncate_with_nan ,
  truncate ,
  standard_asm ,
  rta_asm
}
enum class  fp8_rounding_mode {
  standard = 0 ,
  stochastic
}
enum class  fp8_interpretation {
  E4M3_OCP = 0 ,
  E5M2_OCP = 1 ,
  E4M3_FNUZ = 2 ,
  E5M2_FNUZ = 3
}
 FP8 interpretation used in conversion algorithms. More...
enum  StreamKReductionStrategy : uint32_t {
  Atomic = 0u ,
  Reduction = 1u
}
enum class  MoeFlatmmKind {
  kFFN_gemm1_gate_only ,
  kFFN_gemm1_gate_up ,
  kFFN_gemm2
}
enum class  BlockAttentionBiasEnum {
  NO_BIAS = 0 ,
  ELEMENTWISE_BIAS = 1 ,
  ALIBI = 2
}
enum struct  GenericAttentionMaskEnum {
  NO_MASK = 0 ,
  MASK_FROM_TOP_LEFT = 1 ,
  MASK_FROM_BOTTOM_RIGHT = 2 ,
  MASK_GENERIC
}
enum struct  PositionEncodingEnum {
  NO = 0 ,
  ALIBI = 1
}
enum struct  AlibiMode {
  VERTICAL = 0 ,
  FROM_TOP_LEFT = 1 ,
  FROM_BOTTOM_RIGHT = 2
}
enum class  RotaryEmbeddingEnum {
  NONE = 0 ,
  INTERLEAVED = 1 ,
  HALF_ROTATED = 2
}
enum class  BlockFmhaPipelineEnum {
  QRKSVS = 0 ,
  QRKSVS_ASYNC ,
  QSKSVS ,
  QRKSVS_ASYNC_TRLOAD
}
enum class  FusedMoeGemmWeightPermuteEnum {
  no_permute = 0 ,
  b_nr_kr_kw_nw_kv = 1 ,
  b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
}
enum class  FusedMoeGemmPipelineSequencerEnum {
  SLD_A = 1 << 0 ,
  SLD_B = 1 << 1 ,
  GLD_A = 1 << 2 ,
  GLD_B = 1 << 3 ,
  SST_A = 1 << 4 ,
  SST_B = 1 << 5 ,
  GST_O = 1 << 6
}
enum struct  GemmLoopOrder {
  KMN ,
  MNK
}
enum struct  GemmPipelineScheduler {
  Default ,
  Intrawave ,
  Interwave
}
enum struct  TailNumber {
  Odd ,
  Even ,
  One ,
  Two ,
  Three ,
  Four ,
  Five ,
  Six ,
  Seven ,
  Empty ,
  Full
}
enum struct  GemmPipeline {
  COMPUTE_ASYNC ,
  COMPUTE_V3 ,
  COMPUTE_V4 ,
  COMPUTE_V5 ,
  COMPUTE_V6 ,
  MEMORY ,
  BASIC_V1 ,
  BASIC_V2 ,
  PRESHUFFLE_V2
}
enum class  WGAttrNumAccessEnum {
  Single = 1 ,
  Double = 2 ,
  Quad = 4 ,
  Invalid = -1
}
enum class  WGAttrCtlEnum {
  Default_ = 0 ,
  Raw_vvv = 1 ,
  Raw_vaa = 2 ,
  Raw_vav = 3 ,
  Raw_vva = 4 ,
  Raw_avv = 5
}
enum struct  QuantType : std::uint16_t {
  AQuantGrouped = 0 ,
  BQuantGrouped = 1 ,
  RowColQuant = 2 ,
  TensorQuant = 3
}
enum struct  ConvolutionSpecialization {
  Default ,
  Filter1x1Pad0 ,
  Filter1x1Stride1Pad0 ,
  Filter3x3
}
enum class  Layernorm2dXBiasEnum {
  NO_BIAS = 0 ,
  ADD_BIAS = 1
}
enum class  Layernorm2dFusedAddEnum {
  NO_ADD = 0 ,
  PRE_ADD_STORE = 1 ,
  PRE_ADD = 2
}
enum class  Layernorm2dFusedQuantEnum {
  NO_SWEEP = 0 ,
  SMOOTH_DYNAMIC_QUANT = 1 ,
  DYNAMIC_QUANT = 2
}
enum class  Rmsnorm2dFusedAddEnum {
  NO_ADD = 0 ,
  PRE_ADD_STORE = 1 ,
  PRE_ADD = 2
}
enum class  Rmsnorm2dFusedQuantEnum {
  NO_SWEEP = 0 ,
  SMOOTH_DYNAMIC_QUANT = 1 ,
  DYNAMIC_QUANT = 2
}
enum class  Rmsnorm2dSensitiveEnum {
  NO_SPECIFIC_MODEL = 0 ,
  T5_MODEL_LIKE = 1
}
enum class  naive_attention_layout_enum {
  DEFAULT ,
  BSHD ,
  BHSD ,
  BS3HD ,
  PHSD ,
  PHDSX ,
  PHDS ,
  SCALE_HS ,
  SCALE_SH
}
enum class  naive_attention_variation_enum {
  FLASH_BATCHED = 0 ,
  FLASH_GROUPED ,
  DECODE_PAGED
}
enum class  naive_attention_quant_algo {
  NO = 0 ,
  KV_8BIT_PERHEAD = 1 ,
  KV_8BIT_PERTOKEN = 2
}

Functions

template<typename Lengths, typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
CK_TILE_HOST_DEVICE constexpr auto make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::size(), 1 >::type{})
template<typename LowLength>
CK_TILE_HOST_DEVICE constexpr auto make_pass_through_transform (const LowLength &low_length)
template<typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
template<typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
template<typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
template<typename UpLengths, typename Coefficients, typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients)
template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto make_merge_transform_v2_magic_division (const LowLengths &low_lengths)
template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto make_merge_transform_v3_division_mod (const LowLengths &low_lengths)
template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto make_merge_transform (const LowLengths &low_lengths)
template<typename UpLengths, bool Use24BitIntegerCalculation = false>
CK_TILE_HOST_DEVICE constexpr auto make_unmerge_transform (const UpLengths &up_lengths, bool_constant< Use24BitIntegerCalculation >=bool_constant< false >{})
template<typename LowerIndex>
CK_TILE_HOST_DEVICE constexpr auto make_freeze_transform (const LowerIndex &low_idx)
template<typename UpperIndex>
CK_TILE_HOST_DEVICE constexpr auto make_insert_transform (const UpperIndex &up_idx)
template<typename UpLengths>
CK_TILE_HOST_DEVICE constexpr auto make_replicate_transform (const UpLengths &up_lengths)
template<typename LowLength, typename SliceBegin, typename SliceEnd>
CK_TILE_HOST_DEVICE constexpr auto make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end)
template<typename Modulus, typename UpLength>
CK_TILE_HOST_DEVICE constexpr auto make_modulo_transform (const Modulus &modulus, const UpLength &up_length)
template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto make_xor_transform (const LowLengths &low_lengths)
template<typename LowLength, typename OffsetLength>
CK_TILE_HOST_DEVICE constexpr auto make_offset_transform (const LowLength &low_length, const OffsetLength &offset_length)
template<typename UpLength, typename Indices>
CK_TILE_HOST_DEVICE constexpr auto make_indexing_transform (const UpLength &up_lengths, const Indices &indices)
template<typename UpLength, typename IndexingAdaptor>
CK_TILE_HOST_DEVICE constexpr auto make_indexing_transform_with_adaptor (const UpLength &up_lengths, const IndexingAdaptor &iadaptor)
constexpr const char * tile_distribution_pattern_to_string (tile_distribution_pattern pattern)
template<index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize, tile_distribution_pattern DistributionPattern, index_t NumWaveGroups>
CK_TILE_HOST_DEVICE void print (const tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &)
__device__ uint32_t amd_wave_read_first_lane (uint16_t v)
__device__ uint32_t amd_wave_read_first_lane (uint8_t v)
__device__ uint32_t amd_wave_read_first_lane (uint32_t value)
__device__ int32_t amd_wave_read_first_lane (int32_t value)
template<typename Object, std::enable_if_t< std::is_trivially_copyable_v< Object >, int > = 0>
__device__ auto amd_wave_read_first_lane (const Object &obj)
template<typename ForceSGPR = std::false_type>
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
CK_TILE_DEVICE void buffer_load_fence (index_t cnt=0)
CK_TILE_DEVICE void lds_load_fence (index_t cnt=0)
template<typename... T>
CK_TILE_DEVICE void buffer_load_fence (index_t cnt=0, T &... o)
CK_TILE_DEVICE void buffer_store_fence (index_t cnt=0)
CK_TILE_DEVICE auto async_load_fence_raw (index_t cnt=0)
CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
CK_TILE_DEVICE_EXTERN bf16x2_t llvm_amdgcn_raw_buffer_atomic_add_bf16x2 (bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16")
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
template<unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dwordxn_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
CK_TILE_DEVICE void async_buffer_load_fence (index_t cnt=0)
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< int8_t, N > amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_impl (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
template<typename T, index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
template<typename T, index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t LaneGroupSize, index_t kOuterDistDim0, index_t kOuterDistDim1, index_t kInnerDistDim0, index_t kInnerDistDim1>
CK_TILE_DEVICE constexpr auto make_transposed_distr_encode ()
CK_TILE_HOST_DEVICE constexpr index_t get_warp_size ()
CK_TILE_HOST bool is_wave32 ()
CK_TILE_DEVICE index_t get_grid_size ()
CK_TILE_DEVICE index_t get_block_size ()
CK_TILE_DEVICE index_t get_thread_local_1d_id ()
CK_TILE_DEVICE index_t get_thread_global_1d_id ()
CK_TILE_DEVICE index_t get_block_1d_id ()
CK_TILE_DEVICE index_t get_lane_id ()
template<bool ReturnSgpr = true>
CK_TILE_DEVICE index_t get_warp_id (bool_constant< ReturnSgpr >={})
CK_TILE_DEVICE index_t get_thread_id ()
CK_TILE_DEVICE index_t get_block_id ()
CK_TILE_DEVICE void block_sync_load_raw (index_t cnt=0)
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt ()
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt_barrier ()
template<index_t lgkmcnt = 0>
CK_TILE_DEVICE void block_sync_lds ()
template<index_t vmcnt = 0>
CK_TILE_DEVICE void block_sync_lds_direct_load ()
CK_TILE_DEVICE void s_nop (index_t cnt=0)
template<typename T>
__device__ T * cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p)
template<typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space (T *p)
CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity ()
CK_TILE_HOST_DEVICE constexpr const char * address_space_to_string (address_space_enum addr_space)
 Helper function to convert address space enum to string.
template<typename T, typename ComputeType>
CK_TILE_HOST_DEVICE T add (const T &a, const T &b)
CK_TILE_HOST_DEVICE bf16x2_t add_bf16x2_t (const bf16x2_t &a, const bf16x2_t &b)
CK_TILE_HOST_DEVICE bf16x4_t add_bf16x4_t (const bf16x4_t &a, const bf16x4_t &b)
CK_TILE_HOST_DEVICE fp16x2_t add_f16x2_t (const fp16x2_t &a, const fp16x2_t &b)
CK_TILE_HOST_DEVICE fp8x4_t add_fp8x4_t (const fp8x4_t &a, const fp8x4_t &b)
CK_TILE_HOST_DEVICE fp8x8_t add_fp8x8_t (const fp8x8_t &a, const fp8x8_t &b)
CK_TILE_HOST_DEVICE bf8x4_t add_bf8x4_t (const bf8x4_t &a, const bf8x4_t &b)
CK_TILE_HOST_DEVICE bf8x8_t add_bf8x8_t (const bf8x8_t &a, const bf8x8_t &b)
template<typename X>
CK_TILE_DEVICE void atomic_add (X *p_dst, const X &x)
template<>
CK_TILE_DEVICE void atomic_add< bf16x2_t > (bf16x2_t *p_dst, const bf16x2_t &x)
template<>
CK_TILE_DEVICE void atomic_add< bf16x4_t > (bf16x4_t *p_dst, bf16x4_t const &x)
template<>
CK_TILE_DEVICE void atomic_add< fp8x4_t > (fp8x4_t *p_dst, const fp8x4_t &x)
template<>
CK_TILE_DEVICE void atomic_add< bf8x4_t > (bf8x4_t *p_dst, const bf8x4_t &x)
template<>
CK_TILE_DEVICE void atomic_add< fp8x8_t > (fp8x8_t *p_dst, fp8x8_t const &x)
template<>
CK_TILE_DEVICE void atomic_add< bf8x8_t > (bf8x8_t *p_dst, bf8x8_t const &x)
template<>
CK_TILE_DEVICE void atomic_add< fp16x2_t > (fp16x2_t *p_dst, fp16x2_t const &x)
template<typename T, index_t N>
CK_TILE_DEVICE void atomic_add_g (T *p_dst, const thread_buffer< T, N > &x)
template<typename T, index_t N>
CK_TILE_DEVICE void atomic_max_g (T *p_dst, const thread_buffer< T, N > &x)
CK_TILE_DEVICE void m0_set_with_memory (index_t v)
CK_TILE_DEVICE void m0_inc_with_memory (index_t v)
template<typename T>
CK_TILE_DEVICE T warp_shuffle_up (const T &v_local, uint32_t lane_delta)
template<typename T>
CK_TILE_DEVICE T warp_shuffle_down (const T &v_local, uint32_t lane_delta)
template<typename T>
CK_TILE_DEVICE auto warp_shuffle_down_pair (const T &v_local)
template<typename T>
CK_TILE_DEVICE T warp_shuffle (const T &v_local, uint32_t src_lane)
template<typename T>
CK_TILE_DEVICE auto flag_to_exec (const T &v_flag)
template<typename X, typename Y>
CK_TILE_DEVICE auto cmp_lt_to_exec (const X &x, const Y &y)
template<typename D = void, typename... Ts>
CK_TILE_HOST_DEVICE constexpr details::return_type< D, Ts... > make_array (Ts &&... ts)
template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr auto make_array_with (std::initializer_list< T > ilist)
template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr bool operator== (const array< T, Size > &a, const array< T, Size > &b)
template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr bool operator!= (const array< T, Size > &a, const array< T, Size > &b)
template<typename T, index_t N, typename X>
CK_TILE_HOST_DEVICE constexpr auto to_array (const std::vector< X > &x)
template<typename T, index_t N, typename X>
CK_TILE_HOST_DEVICE constexpr auto to_array (const X &x)
template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto container_push_back (const array< TData, NSize > &a, const TData &x)
template<typename... Ts, typename T>
CK_TILE_HOST_DEVICE constexpr auto container_push_front (const tuple< Ts... > &a, const T &x)
template<typename... Ts, typename T>
CK_TILE_HOST_DEVICE constexpr auto container_push_back (const tuple< Ts... > &a, const T &x)
template<typename TData, index_t NSize, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old (const array< TData, NSize > &old_array, sequence< IRs... >)
template<typename TData, index_t NSize, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_old2new (const array< TData, NSize > &old_array, sequence< IRs... > old2new)
template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old (const array< TData, NSize > &old_array, const map< index_t, index_t > &new2old)
template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_old2new (const array< TData, NSize > &old_array, const map< index_t, index_t > &old2new)
template<typename... Ts, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old (const tuple< Ts... > &old_tuple, sequence< IRs... >)
template<typename... Ts, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_old2new (const tuple< Ts... > &old_tuple, sequence< IRs... > old2new)
template<index_t... Is, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_new2old (sequence< Is... >, sequence< IRs... >)
template<index_t... Is, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto container_reorder_given_old2new (sequence< Is... > old_seq, sequence< IRs... >)
template<typename Container, typename Reduce, typename ROld, index_t I, index_t IEnd, index_t IStep>
CK_TILE_HOST_DEVICE constexpr auto container_reduce_impl (const Container &x, Reduce reduce, ROld r_old, number< I > i, number< IEnd >, number< IStep >)
template<typename Container, typename Reduce, typename Init, index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1>
CK_TILE_HOST_DEVICE constexpr auto container_reduce (const Container &x, Reduce reduce, Init init, number< IBegin >=number< 0 >{}, number< IEnd >=number< Container::size()>{}, number< IStep >=number< 1 >{})
template<typename TData, index_t NSize, typename Reduce>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_inclusive_scan (const array< TData, NSize > &x, Reduce f, TData init)
template<typename TData, index_t NSize, typename Reduce, typename Init>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_exclusive_scan (const array< TData, NSize > &x, Reduce f, Init init)
template<index_t... Is, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_exclusive_scan (const sequence< Is... > &seq, Reduce f, number< Init >)
template<typename... Xs, typename Reduce, index_t I, typename YOld, typename ROld>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_exclusive_scan_impl (const tuple< Xs... > &x, Reduce reduce, number< I > i, YOld y_old, ROld r_old)
template<typename... Xs, typename Reduce, typename Init>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_exclusive_scan (const tuple< Xs... > &x, Reduce reduce, Init init)
template<typename... Xs, typename Reduce, typename TData>
CK_TILE_HOST_DEVICE constexpr auto container_reverse_inclusive_scan (const tuple< Xs... > &x, Reduce f, TData init)
template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto container_concat (const X &x, const Ys &... ys)
template<typename T, index_t NX, index_t NY>
CK_TILE_HOST_DEVICE constexpr auto container_concat (const array< T, NX > &ax, const array< T, NY > &ay)
template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto container_concat (const tuple< X... > &tx, const tuple< Y... > &ty)
template<typename Container>
CK_TILE_HOST_DEVICE constexpr auto container_concat (const Container &x)
template<typename T, index_t N, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto get_container_subset (const array< T, N > &arr, sequence< Is... >)
template<typename... Ts, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto get_container_subset (const tuple< Ts... > &tup, sequence< Is... >)
template<typename T, index_t N, index_t... Is>
CK_TILE_HOST_DEVICE constexpr void set_container_subset (array< T, N > &y, sequence< Is... > picks, const array< T, sizeof...(Is)> &x)
template<typename Y, typename X, index_t... Is>
CK_TILE_HOST_DEVICE constexpr void set_container_subset (Y &y, sequence< Is... > picks, const X &x)
template<index_t... Is>
constexpr index_t container_find (sequence< Is... > seq, index_t value)
template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto sequence_to_tuple_of_number (sequence< Is... >)
template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr auto make_multi_index (Xs &&... xs)
template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto make_zero_multi_index ()
template<typename T>
CK_TILE_HOST_DEVICE constexpr auto to_multi_index (const T &x)
template<index_t NSize, typename X>
CK_TILE_HOST_DEVICE constexpr auto operator+= (multi_index< NSize > &y, const X &x)
template<index_t NSize, typename X>
CK_TILE_HOST_DEVICE constexpr auto operator-= (multi_index< NSize > &y, const X &x)
template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto operator+ (const multi_index< NSize > &a, const T &b)
template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto operator- (const multi_index< NSize > &a, const T &b)
template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto operator* (const multi_index< NSize > &a, const T &b)
template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto operator* (index_t a, const multi_index< NSize > &x)
template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto operator* (const multi_index< NSize > &x, index_t a)
template<index_t I, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto sequence_pop_front (sequence< I, Is... >)
template<typename Seq>
CK_TILE_HOST_DEVICE constexpr auto sequence_pop_back (Seq)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr bool operator== (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr bool operator!= (sequence< Xs... > x, sequence< Ys... > y)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator+ (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator- (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator* (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator/ (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator% (sequence< Xs... >, sequence< Ys... >)
template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto operator+ (sequence< Xs... >, number< Y >)
template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto operator- (sequence< Xs... >, number< Y >)
template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto operator* (sequence< Xs... >, number< Y >)
template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto operator/ (sequence< Xs... >, number< Y >)
template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto operator% (sequence< Xs... >, number< Y >)
template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto operator+ (number< Y >, sequence< Xs... >)
template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto operator- (number< Y >, sequence< Xs... >)
template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto operator* (number< Y >, sequence< Xs... >)
template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto operator/ (number< Y >, sequence< Xs... >)
template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto operator% (number< Y >, sequence< Xs... >)
template<typename... Seqs>
CK_TILE_HOST_DEVICE constexpr auto merge_sequences (Seqs...)
template<typename F, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto transform_sequences (F f, sequence< Xs... >)
template<typename F, index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto transform_sequences (F f, sequence< Xs... >, sequence< Ys... >)
template<typename F, index_t... Xs, index_t... Ys, index_t... Zs>
CK_TILE_HOST_DEVICE constexpr auto transform_sequences (F f, sequence< Xs... >, sequence< Ys... >, sequence< Zs... >)
template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto reverse_inclusive_scan_sequence (Seq, Reduce, number< Init >)
template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto reverse_exclusive_scan_sequence (Seq, Reduce, number< Init >)
template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto inclusive_scan_sequence (Seq, Reduce, number< Init >)
template<typename Seq, typename Reduce, index_t Init>
constexpr auto exclusive_scan_sequence (Seq, Reduce, number< Init >)
template<typename Seq>
constexpr auto prefix_sum_sequence (Seq)
template<typename Seq, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto pick_sequence_elements_by_ids (Seq, sequence< Is... >)
template<typename Seq, typename Mask>
CK_TILE_HOST_DEVICE constexpr auto pick_sequence_elements_by_mask (Seq, Mask)
template<typename Seq, typename Values, typename Ids>
CK_TILE_HOST_DEVICE constexpr auto modify_sequence_elements_by_ids (Seq, Values, Ids)
template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr index_t reduce_on_sequence (Seq, Reduce f, number< Init >)
template<typename Seq, typename F>
CK_TILE_HOST_DEVICE constexpr bool sequence_any_of (Seq, F f)
template<typename Seq, typename F>
CK_TILE_HOST_DEVICE constexpr bool sequence_all_of (Seq, F f)
template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto make_sequence (number< Is >...)
template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto generate_sequence (F, number< N >)
template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto generate_sequence_v2 (F &&f, number< N >)
template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto to_sequence (tuple< number< Is >... >)
template<typename SeqSortedSamples, index_t r, index_t... rs>
CK_TILE_HOST_DEVICE constexpr auto histogram_sorted_sequence (SeqSortedSamples, sequence< r, rs... >)
template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto generate_array (F &&f, number< N >)
template<typename Seq, index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{})
template<typename Seq, index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{})
template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer (Ts &&... ts)
template<typename... T>
CK_TILE_HOST_DEVICE void print (const tuple< T... > &t)
template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr bool operator== (const tuple< Xs... > &a, const tuple< Xs... > &b)
template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr bool operator!= (const tuple< Xs... > &a, const tuple< Xs... > &b)
template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr auto make_tuple (Xs &&... xs)
template<typename... Args>
constexpr tuple< Args &... > tie (Args &... args) noexcept
template<typename F, index_t... ids>
CK_TILE_HOST_DEVICE constexpr auto generate_tuple_for (F &&f, sequence< ids... >)
template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto generate_tuple (F &&f, number< N >)
template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto generate_tie (F &&f, number< N >)
template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto concat_tuple_of_reference (const tuple< X &... > &tx, const tuple< Y &... > &ty)
template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto concat_tuple (const tuple< X... > &tx, const tuple< Y... > &ty)
template<typename... X>
CK_TILE_HOST_DEVICE constexpr auto concat_tuple (const tuple< X... > &tx)
template<typename... X, typename... Tuples>
CK_TILE_HOST_DEVICE constexpr auto concat_tuple (const tuple< X... > &tx, const Tuples &... tuples)
template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto transform_tuples (F f, const X &x)
template<typename F, typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto transform_tuples (F f, const X &x, const Y &y)
template<typename F, typename X, typename Y, typename Z>
CK_TILE_HOST_DEVICE constexpr auto transform_tuples (F f, const X &x, const Y &y, const Z &z)
template<typename F, typename Tuple>
constexpr decltype(auto) apply (F &&f, Tuple &&t)
template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto embed_tuples (F f, const X &x)
template<index_t Depth = 0, index_t MaxDepth = -1>
CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple (const tuple<> &t)
template<index_t Depth = 0, index_t MaxDepth = -1, typename T>
CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple (const T &t)
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple (const tuple< Ts... > &t)
template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto tuple_reverse (const tuple< Ts... > &t)
template<index_t Idx, index_t End, typename F, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto tuple_reduce (F &&f, const tuple< Ts... > &t)
template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto is_nested_tuple (const tuple< Ts... > &)
template<index_t depth = 0, typename T>
CK_TILE_HOST_DEVICE constexpr auto tuple_depth (const T &)
template<index_t depth = 0, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto tuple_depth (const tuple< Ts... > &)
template<typename... Seqs>
CK_TILE_HOST_DEVICE constexpr auto to_array_of_array (tuple< Seqs... > t_of_s)
template<typename... Ys, typename X, std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator+= (tuple< Ys... > &y, const X &x)
template<typename... Ys, typename X, std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator-= (tuple< Ys... > &y, const X &x)
template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator+ (const tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator+ (const tuple< Xs... > &x, const tuple< Ys... > &y)
template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator- (const tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator- (const tuple< Xs... > &x, const tuple< Ys... > &y)
template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator* (const tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename Y, std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator* (Y a, const tuple< Xs... > &x)
template<typename... Xs, typename Y, std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto operator* (const tuple< Xs... > &x, Y a)
template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator* (const tuple< Xs... > &x, const tuple< Ys... > &y)
template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator/ (const tuple< Xs... > &x, const tuple< Ys... > &y)
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_raw (float f, constant< rounding >={})
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr uint16_t double_to_bf16_raw (double f, constant< rounding >={})
CK_TILE_HOST_DEVICE constexpr float bf16_to_float_raw (uint16_t x)
CK_TILE_HOST_DEVICE constexpr double bf16_to_double_raw (uint16_t x)
CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_rtn_raw (float f)
CK_TILE_HOST constexpr uint16_t float_to_bf16_rtn_asm (float f)
CK_TILE_HOST uint16_t float_to_bf16_rta_asm (float f)
CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_truc_nan_raw (float f)
CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_truc_raw (float f)
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t float_to_bf16 (float f, constant< rounding >={})
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t double_to_bf16 (double f, constant< rounding >={})
CK_TILE_HOST_DEVICE constexpr float bf16_to_float (bfloat16_t x)
CK_TILE_HOST_DEVICE constexpr double bf16_to_double (bfloat16_t x)
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE bfloat16_t constexpr fp16_to_bf16 (half_t f, constant< rounding >={})
CK_TILE_HOST_DEVICE constexpr half_t bf16_to_fp16 (bfloat16_t x)
CK_TILE_HOST_DEVICE bfloat16_t abs (const bfloat16_t &x)
CK_TILE_HOST_DEVICE bool isnan (const bfloat16_t &x)
CK_TILE_DEVICE bfloat16_t sqrt (bfloat16_t x)
CK_TILE_DEVICE bfloat16_t exp (bfloat16_t x)
CK_TILE_DEVICE bfloat16_t exp2 (bfloat16_t x)
CK_TILE_DEVICE bfloat16_t log (bfloat16_t x)
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t float_to_fp8_raw (float, constant< rounding >={})
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t float_to_bf8_raw (float, constant< rounding >={})
CK_TILE_HOST_DEVICE float fp8_to_float_raw (uint8_t)
CK_TILE_HOST_DEVICE float bf8_to_float_raw (uint8_t)
template<typename SrcT, typename DstT>
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type float_to_fp8_sr_raw (SrcT x)
 Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding.
template<typename SrcT, typename DstT>
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type float_to_fp8_rtn_raw (SrcT x)
 Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even.
template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_raw (float x, constant< rounding >)
template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_raw (float x, constant< rounding >)
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE fp8_t float_to_fp8 (float x, constant< rounding >={})
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE bf8_t float_to_bf8 (float x, constant< rounding >={})
CK_TILE_HOST_DEVICE float fp8_to_float (fp8_t x)
CK_TILE_HOST_DEVICE float bf8_to_float (bf8_t x)
template<typename T>
CK_TILE_HOST_DEVICE T abs (const T &x)
CK_TILE_HOST_DEVICE bool isnan (const fp8_t &x)
CK_TILE_HOST_DEVICE bool isnan (const bf8_t &x)
CK_TILE_HOST_DEVICE constexpr float fp16_to_float_hip (const fp16_hip_t &x)
CK_TILE_HOST_DEVICE constexpr double fp16_to_double_hip (const fp16_hip_t &x)
CK_TILE_HOST_DEVICE constexpr fp16_hip_t float_to_fp16_hip (const float &x)
CK_TILE_HOST_DEVICE constexpr fp16_hip_t double_to_fp16_hip (const double &x)
CK_TILE_HOST_DEVICE constexpr float fp16_to_float (const half_t &x)
CK_TILE_HOST_DEVICE constexpr float fp16_to_double (const half_t &x)
CK_TILE_HOST_DEVICE constexpr half_t float_to_fp16 (const float &x)
CK_TILE_HOST_DEVICE constexpr half_t double_to_fp16 (const double &x)
CK_TILE_HOST fp16x2_t pk_add_f16 (const fp16x2_t &x, const fp16x2_t &y)
CK_TILE_HOST_DEVICE constexpr float int8_to_float (const int8_t &x)
CK_TILE_HOST_DEVICE constexpr int8_t float_to_int8 (const float &x)
template<typename Scale>
__host__ __device__ scales (Scale) -> scales< Scale >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
__host__ __device__ plus () -> plus< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
__host__ __device__ minus () -> minus< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
__host__ __device__ multiplies () -> multiplies< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto integer_divide_floor (X x, Y y)
template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto integer_divide_ceil (X x, Y y)
template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto integer_least_multiple (X x, Y y)
template<typename T>
CK_TILE_HOST_DEVICE constexpr T max (T x)
template<typename T>
CK_TILE_HOST constexpr T max (T x, T y)
template<typename T>
CK_TILE_DEVICE constexpr T max (T x, T y)
template<>
CK_TILE_DEVICE constexpr float max (float x, float y)
template<>
CK_TILE_DEVICE constexpr double max (double x, double y)
template<index_t X>
CK_TILE_HOST_DEVICE constexpr index_t max (number< X >, index_t y)
template<index_t Y>
CK_TILE_HOST_DEVICE constexpr index_t max (index_t x, number< Y >)
template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto max (X x, Ys... ys)
template<typename T>
CK_TILE_HOST_DEVICE constexpr T min (T x)
template<typename T>
CK_TILE_HOST constexpr T min (T x, T y)
template<typename T>
CK_TILE_DEVICE constexpr T min (T x, T y)
template<>
CK_TILE_DEVICE constexpr float min (float x, float y)
template<>
CK_TILE_DEVICE constexpr double min (double x, double y)
template<index_t X>
CK_TILE_HOST_DEVICE constexpr index_t min (number< X >, index_t y)
template<index_t Y>
CK_TILE_HOST_DEVICE constexpr index_t min (index_t x, number< Y >)
template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto min (X x, Ys... ys)
template<typename T>
CK_TILE_HOST_DEVICE constexpr T clamp (const T &x, const T &lowerbound, const T &upperbound)
CK_TILE_HOST int clz (uint32_t x)
CK_TILE_HOST_DEVICE constexpr index_t gcd (index_t x, index_t y)
template<index_t X, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto gcd (number< X >, number< Y >)
template<typename X, typename... Ys, typename std::enable_if< sizeof...(Ys) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto gcd (X x, Ys... ys)
template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto lcm (X x, Y y)
template<typename X, typename... Ys, typename std::enable_if< sizeof...(Ys) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto lcm (X x, Ys... ys)
__host__ __device__ equal () -> equal< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
__host__ __device__ less () -> less< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
__host__ __device__ less_equal () -> less_equal< void, void >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
CK_TILE_HOST_DEVICE constexpr int32_t next_power_of_two (int32_t x)
template<index_t X>
CK_TILE_HOST_DEVICE constexpr auto next_power_of_two ()
template<index_t X>
CK_TILE_HOST_DEVICE constexpr auto next_power_of_two (number< X >)
CK_TILE_HOST_DEVICE constexpr int32_t integer_log2_floor (int32_t x)
CK_TILE_HOST_DEVICE constexpr bool is_power_of_two_integer (int32_t x)
CK_TILE_DEVICE float exp2 (float x)
CK_TILE_DEVICE uint16_t sad_u16 (uint16_t x, uint16_t y, uint16_t acc)
CK_TILE_DEVICE uint32_t sad_u32 (uint32_t x, uint32_t y, uint32_t acc)
CK_TILE_HOST float abs (float x)
CK_TILE_HOST double abs (double x)
CK_TILE_HOST int8_t abs (int8_t x)
CK_TILE_HOST int32_t abs (int32_t x)
CK_TILE_HOST fp16_t abs (fp16_t x)
CK_TILE_HOST bool isnan (float x)
CK_TILE_HOST bool isnan (double x)
CK_TILE_HOST bool isnan (int8_t x)
CK_TILE_HOST bool isnan (int32_t x)
CK_TILE_HOST bool isnan (fp16_t x)
CK_TILE_HOST fp16_t sqrt (fp16_t x)
CK_TILE_HOST float sqrt (float x)
CK_TILE_HOST double sqrt (double x)
template<typename T>
CK_TILE_HOST T tanh (T x)
template<>
CK_TILE_HOST float tanh< float > (float x)
template<>
CK_TILE_HOST double tanh< double > (double x)
template<typename T>
CK_TILE_HOST T acos (T x)
template<>
CK_TILE_HOST float acos< float > (float x)
template<>
CK_TILE_HOST double acos< double > (double x)
template<typename T>
CK_TILE_HOST T neg (T x)
template<>
CK_TILE_HOST float neg< float > (float x)
template<>
CK_TILE_HOST double neg< double > (double x)
template<>
CK_TILE_HOST int32_t neg< int32_t > (int32_t x)
template<>
CK_TILE_HOST int8_t neg< int8_t > (int8_t x)
template<typename T>
CK_TILE_HOST T atan (T x)
template<>
CK_TILE_HOST float atan< float > (float x)
template<>
CK_TILE_HOST double atan< double > (double x)
template<typename T>
CK_TILE_HOST T sin (T x)
template<>
CK_TILE_HOST float sin< float > (float x)
template<>
CK_TILE_HOST double sin< double > (double x)
template<typename T>
CK_TILE_HOST T asin (T x)
template<>
CK_TILE_HOST float asin< float > (float x)
template<>
CK_TILE_HOST double asin< double > (double x)
template<typename T>
CK_TILE_HOST T asinh (T x)
template<>
CK_TILE_HOST float asinh< float > (float x)
template<>
CK_TILE_HOST double asinh< double > (double x)
template<typename T>
CK_TILE_HOST T cos (T x)
template<>
CK_TILE_HOST float cos< float > (float x)
template<>
CK_TILE_HOST double cos< double > (double x)
template<typename T>
CK_TILE_HOST T acosh (T x)
template<>
CK_TILE_HOST float acosh< float > (float x)
template<>
CK_TILE_HOST double acosh< double > (double x)
template<typename T>
CK_TILE_HOST T tan (T x)
template<>
CK_TILE_HOST float tan< float > (float x)
template<>
CK_TILE_HOST double tan< double > (double x)
template<typename T>
CK_TILE_HOST T atanh (T x)
template<>
CK_TILE_HOST float atanh< float > (float x)
template<>
CK_TILE_HOST double atanh< double > (double x)
template<typename T>
CK_TILE_HOST T sinh (T x)
template<>
CK_TILE_HOST float sinh< float > (float x)
template<>
CK_TILE_HOST double sinh< double > (double x)
template<typename T>
CK_TILE_HOST T ceil (T x)
template<>
CK_TILE_HOST float ceil< float > (float x)
template<>
CK_TILE_HOST double ceil< double > (double x)
template<typename T>
CK_TILE_HOST T cosh (T x)
template<>
CK_TILE_HOST float cosh< float > (float x)
template<>
CK_TILE_HOST double cosh< double > (double x)
template<typename T>
CK_TILE_HOST T floor (T x)
template<>
CK_TILE_HOST float floor< float > (float x)
template<>
CK_TILE_HOST double floor< double > (double x)
template<typename T>
CK_TILE_HOST T rcp (T x)
template<typename T>
CK_TILE_HOST T exp (T x)
template<>
CK_TILE_HOST float exp< float > (float x)
template<>
CK_TILE_HOST double exp< double > (double x)
template<typename T>
CK_TILE_HOST T log (T x)
template<>
CK_TILE_HOST float log< float > (float x)
template<>
CK_TILE_HOST double log< double > (double x)
template<typename T>
CK_TILE_HOST T pow (T x, T gamma)
template<>
CK_TILE_HOST float pow< float > (float x, float gamma)
template<>
CK_TILE_HOST double pow< double > (double x, double gamma)
template<typename T>
CK_TILE_HOST T expm1 (T x)
template<>
CK_TILE_HOST float expm1< float > (float x)
template<>
CK_TILE_HOST double expm1< double > (double x)
template<typename T>
CK_TILE_DEVICE T tanh (T x)
template<>
CK_TILE_DEVICE float tanh< float > (float x)
template<>
CK_TILE_DEVICE double tanh< double > (double x)
template<typename T>
CK_TILE_DEVICE T acos (T x)
template<>
CK_TILE_DEVICE float acos< float > (float x)
template<>
CK_TILE_DEVICE double acos< double > (double x)
template<typename T>
CK_TILE_DEVICE T neg (T x)
template<>
CK_TILE_DEVICE float neg< float > (float x)
template<>
CK_TILE_DEVICE double neg< double > (double x)
template<>
CK_TILE_DEVICE int32_t neg< int32_t > (int32_t x)
template<>
CK_TILE_DEVICE int8_t neg< int8_t > (int8_t x)
template<>
CK_TILE_DEVICE fp16_t neg< fp16_t > (fp16_t x)
template<typename T>
CK_TILE_DEVICE T atan (T x)
template<>
CK_TILE_DEVICE float atan< float > (float x)
template<>
CK_TILE_DEVICE double atan< double > (double x)
template<typename T>
CK_TILE_DEVICE T sin (T x)
template<>
CK_TILE_DEVICE float sin< float > (float x)
template<>
CK_TILE_DEVICE double sin< double > (double x)
template<>
CK_TILE_DEVICE fp16_t sin< fp16_t > (fp16_t x)
template<typename T>
CK_TILE_DEVICE T asin (T x)
template<>
CK_TILE_DEVICE float asin< float > (float x)
template<>
CK_TILE_DEVICE double asin< double > (double x)
template<typename T>
CK_TILE_DEVICE T asinh (T x)
template<>
CK_TILE_DEVICE float asinh< float > (float x)
template<>
CK_TILE_DEVICE double asinh< double > (double x)
template<typename T>
CK_TILE_DEVICE T acosh (T x)
template<>
CK_TILE_DEVICE float acosh< float > (float x)
template<>
CK_TILE_DEVICE double acosh< double > (double x)
template<typename T>
CK_TILE_DEVICE T tan (T x)
template<>
CK_TILE_DEVICE float tan< float > (float x)
template<>
CK_TILE_DEVICE double tan< double > (double x)
template<typename T>
CK_TILE_DEVICE T atanh (T x)
template<>
CK_TILE_DEVICE float atanh< float > (float x)
template<>
CK_TILE_DEVICE double atanh< double > (double x)
template<typename T>
CK_TILE_DEVICE T sinh (T x)
template<>
CK_TILE_DEVICE float sinh< float > (float x)
template<>
CK_TILE_DEVICE double sinh< double > (double x)
template<typename T>
CK_TILE_DEVICE T ceil (T x)
template<>
CK_TILE_DEVICE float ceil< float > (float x)
template<>
CK_TILE_DEVICE double ceil< double > (double x)
template<>
CK_TILE_DEVICE fp16_t ceil< fp16_t > (fp16_t x)
template<typename T>
CK_TILE_DEVICE T cosh (T x)
template<>
CK_TILE_DEVICE float cosh< float > (float x)
template<>
CK_TILE_DEVICE double cosh< double > (double x)
template<typename T>
CK_TILE_DEVICE T floor (T x)
template<>
CK_TILE_DEVICE float floor< float > (float x)
template<>
CK_TILE_DEVICE double floor< double > (double x)
template<>
CK_TILE_DEVICE fp16_t floor< fp16_t > (fp16_t x)
template<typename T>
CK_TILE_DEVICE T rcp (T x)
template<typename T>
CK_TILE_DEVICE T exp (T x)
template<>
CK_TILE_DEVICE fp16_t exp< fp16_t > (fp16_t x)
template<>
CK_TILE_DEVICE float exp< float > (float x)
template<>
CK_TILE_DEVICE double exp< double > (double x)
template<typename T>
CK_TILE_DEVICE T tanh_fast (T x)
template<>
CK_TILE_DEVICE float tanh_fast< float > (float x)
template<typename T>
CK_TILE_DEVICE T log (T x)
template<>
CK_TILE_DEVICE fp16_t log< fp16_t > (fp16_t x)
template<>
CK_TILE_DEVICE float log< float > (float x)
template<>
CK_TILE_DEVICE double log< double > (double x)
template<typename T>
CK_TILE_DEVICE T pow (T x, T gamma)
template<>
CK_TILE_DEVICE float pow< float > (float x, float gamma)
template<>
CK_TILE_DEVICE double pow< double > (double x, double gamma)
template<typename T>
CK_TILE_DEVICE T expm1 (T x)
template<>
CK_TILE_DEVICE float expm1< float > (float x)
template<>
CK_TILE_DEVICE double expm1< double > (double x)
template<typename T>
CK_TILE_HOST_DEVICE float convert_to_float (typename T::raw_type data, float scale=1.f)
template<typename T>
CK_TILE_HOST_DEVICE T::raw_type convert_to_type (float value, float scale=1.f)
CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t float_to_pk_fp4 (const float &x, float scale=1.f)
CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_mxfp4 (float x, float scale)
CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4 (const fp16_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4 (const bf16_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4 (const fp16x2_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4 (const bf16x2_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4 (const fp32x2_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2 (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2 (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2 (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr float pk_fp4_to_float (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_to_fp16 (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_to_bf16 (const pk_fp4_t &x, float scale)
CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t (const pk_int4_t &x)
CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t_signed_conversion (const pk_int4_t &x)
CK_TILE_HOST_DEVICE fp16x2_t pk_int4_t_to_halfx2_t (const pk_int4_t &x)
CK_TILE_HOST_DEVICE bf16x2_t pk_int4_t_to_bfloat16x2_t (const pk_int4_t &x)
CK_TILE_HOST_DEVICE int8x2_t pk_int4_t_to_int8x2_t (const pk_int4_t &x)
template<typename Y, typename X, std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool > = false>
CK_TILE_HOST_DEVICE constexpr Y type_convert (X x)
template<typename Y, typename X, std::enable_if_t< std::is_const_v< Y >||std::is_const_v< X >, bool > = false>
CK_TILE_HOST_DEVICE constexpr Y type_convert (X x)
template<typename Y, typename X>
CK_TILE_HOST_DEVICE constexpr Y scaled_type_convert (X x, float scale)
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T, typename BufferSizeType>
CK_TILE_HOST_DEVICE constexpr auto make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size)
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T, typename BufferSizeType, typename X, typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size, X invalid_element_value)
template<address_space_enum BufferAddressSpace, typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum Coherence>
CK_TILE_HOST_DEVICE void print (const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > &bv)
template<typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile (const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
template<typename TileWindow_, typename ElementWise_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile_with_elementwise (const TileWindow_ &tile_window, ElementWise_ elementwise, number< i_access >={}, bool_constant< oob_conditional_check >={})
 Load tile with elementwise function.
template<typename DistributedTensor_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile (DistributedTensor_ &dst_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
template<typename T, typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw (T &tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 Loads a tile of data using inline assembly.
template<typename T, typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw (T &tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
template<typename LdsTileWindow_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto async_load_tile (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
template<typename LdsTileWindow_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto async_load_tile_raw (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
CK_TILE_DEVICE auto async_load_fence (index_t cnt=0)
template<typename WindowLengths>
CK_TILE_DEVICE auto load_tile (const null_tile_window< WindowLengths > &)
template<typename T, typename WindowLengths>
CK_TILE_DEVICE auto load_tile_raw (T &, const null_tile_window< WindowLengths > &)
constexpr int DS_READ_TR_SIZE ()
template<typename InnerEncode, index_t kLeadIterPerWarp, index_t kSecondIterPerWarp, index_t kLeadNumWarps, index_t kSecondNumWarps>
CK_TILE_HOST_DEVICE constexpr auto InputTileDistributionEncoding ()
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>, typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_, typename BottomTensorView_::DataType, Policy>::distr_encoding_valid, Policy>>
CK_TILE_DEVICE auto load_tile_transpose (const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window)
 Transpose-loads a tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. Uses SFINAE to ensure the tile distribution encoding is valid.
template<typename T>
CK_TILE_DEVICE constexpr auto is_null_tile_window (const T &)
template<typename WindowLengths>
CK_TILE_DEVICE constexpr auto make_null_tile_window (const WindowLengths &window_lengths)
template<typename WindowLengths, typename... Ts>
CK_TILE_DEVICE constexpr auto make_tile_window (null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
template<typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto make_tile_window (const null_tile_window< WindowLengths > &t, const StaticTileDistribution &)
template<typename WindowLengths>
CK_TILE_DEVICE void move_tile_window (null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
template<typename OutTensor, typename InTensor>
CK_TILE_DEVICE void shuffle_tile (OutTensor &out, const InTensor &in)
template<typename BottomTensorView_, typename WindowLengths_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto get_slice_tile (const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
template<typename DataType_, typename StaticTileDistribution_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto get_slice_tile (const static_distributed_tensor< DataType_, StaticTileDistribution_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
template<typename DstDataType_, typename DstStaticTileDistribution_, typename SrcDataType_, typename SrcStaticTileDistribution_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto set_slice_tile (static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > &dst_tile, const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > &src_tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
template<typename DataType, typename StaticTileDistribution>
CK_TILE_HOST_DEVICE constexpr auto make_static_distributed_tensor (const StaticTileDistribution &)
template<typename DataType, typename StaticTileDistribution, typename ThreadBuffer>
CK_TILE_HOST_DEVICE constexpr auto make_static_distributed_tensor (const StaticTileDistribution &, ThreadBuffer &&thread_buffer_)
template<typename StaticTileDistribution, typename DistributedIndices>
CK_TILE_HOST_DEVICE constexpr auto get_x_indices_from_distributed_indices (StaticTileDistribution tile_distribution, DistributedIndices distributed_indices)
template<typename DataType, typename StaticTileDistribution, typename XIndicesPredicate>
CK_TILE_HOST_DEVICE void set_tile_if (static_distributed_tensor< DataType, StaticTileDistribution > &out_tensor, DataType value, XIndicesPredicate predicate)
template<typename YLengths, index_t XUnpacks>
CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks (YLengths, number< XUnpacks >)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void store_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void store_tile_raw (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_>
CK_TILE_DEVICE void store_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_>
CK_TILE_DEVICE void store_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_>
CK_TILE_DEVICE void store_tile (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_>
CK_TILE_DEVICE void store_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename TileDistributedSpan_, typename F>
CK_TILE_DEVICE void sweep_tile_span (TileDistributedSpan_, const F &f)
template<typename TileDistributedSpan_, typename F, typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void sweep_tile_uspan (TileDistributedSpan_, const F &f, Unpacks={})
template<typename DistributedTensor, typename F, typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void sweep_tile (const F &f, UnpacksPerXDim={})
template<typename DistributedTensor, typename F, typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void sweep_tile (const DistributedTensor &, const F &f, UnpacksPerXDim={})
template<typename T, typename F, typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE_EXTERN tile_sweeper (const T &, const F &, U={}) -> tile_sweeper< T, F, U >
template<typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
template<typename OldTensorAdaptor, typename NewTransforms, typename NewLowerDimensionOldTopIdss, typename NewUpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto transform_tensor_adaptor (const OldTensorAdaptor &old_tensor_adaptor, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss)
template<typename TensorAdaptor0, typename TensorAdaptor1>
CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
template<typename X, typename... Xs, typename std::enable_if< sizeof...(Xs) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors (const X &x, const Xs &... xs)
template<typename Adaptor, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr auto make_tensor_adaptor_coordinate (const Adaptor &adaptor, const TopIndex &idx_top)
template<bool JudgeDoTransforms = true, typename Adaptor, typename AdaptorCoord, typename TopIndex, typename BottomIndex>
CK_TILE_HOST_DEVICE constexpr void move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top, BottomIndex &idx_diff_bottom)
template<bool JudgeDoTransforms = true, typename Adaptor, typename AdaptorCoord, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr void move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top)
template<typename Adaptor, typename AdaptorCoord>
CK_TILE_HOST_DEVICE constexpr bool adaptor_coordinate_is_valid_assuming_top_index_is_valid (const Adaptor &adaptor, const AdaptorCoord &coord)
template<typename Adaptor, typename AdpatorCoord>
CK_TILE_HOST_DEVICE constexpr bool adaptor_coordinate_is_valid (const Adaptor &adaptor, const AdpatorCoord &coord)
template<typename TensorDesc, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr auto make_tensor_coordinate (const TensorDesc &tensor_desc, const TopIndex &idx_top)
template<bool JudgeDoTransforms = true, typename TensorDesc, typename TensorCoord, typename Index>
CK_TILE_HOST_DEVICE constexpr void move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step)
template<typename TensorDesc, typename TensorCoord>
CK_TILE_HOST_DEVICE constexpr bool coordinate_has_valid_offset_assuming_top_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord)
template<typename TensorDesc, typename TensorCoord>
CK_TILE_HOST_DEVICE constexpr bool coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord)
template<typename Adaptor, typename ElementSpaceSize>
CK_TILE_HOST_DEVICE constexpr auto make_tensor_descriptor_from_adaptor (const Adaptor &adaptor, const ElementSpaceSize &element_space_size)
template<typename OldTensorDescriptor, typename NewTransforms, typename NewLowerDimensionOldTopIdss, typename NewUpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss)
template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{})
template<typename... Lengths, typename... Strides, typename offset, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_with_offset (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, const offset &os, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{})
template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_packed (const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{})
template<typename... Lengths, typename... Strides, typename Offset, index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_packed_with_offset (const tuple< Lengths... > &lengths, const Offset &offset, number< GuaranteedLastDimensionVectorLength >=number<-1 >{})
template<typename... Lengths, typename Align>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_aligned (const tuple< Lengths... > &lengths, Align align)
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto make_tensor_view (DataType *__restrict__ p, const tensor_descriptor< Ts... > &desc)
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_view (DataType *__restrict__ p, const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{})
template<address_space_enum BufferAddressSpace = address_space_enum::generic, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_view_packed (DataType *__restrict__ p, const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{})
template<typename OldTensorView, typename NewTransforms, typename NewLowerDimensionOldVisibleIdss, typename NewUpperDimensionNewVisibleIdss>
CK_TILE_HOST_DEVICE constexpr auto transform_tensor_view (const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
template<typename TensorView, typename TileLengths, typename DoPads>
CK_TILE_HOST_DEVICE constexpr auto pad_tensor_view (const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads)
template<typename StaticTileDistributionEncoding_>
CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution (StaticTileDistributionEncoding_)
template<typename PsYs2XsAdaptor_, typename Ys2DDescriptor_, typename StaticTileDistributionEncoding_, typename TileDistributionDetail_>
CK_TILE_HOST_DEVICE void print (const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > &distribution)
template<typename RsLengths_, typename HsLengthss_, typename Ps2RHssMajor_, typename Ps2RHssMinor_, typename Ys2RHsMajor_, typename Ys2RHsMinor_>
CK_TILE_HOST_DEVICE void print (const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail &detail_obj)
template<typename RsLengths_, typename HsLengthss_, typename Ps2RHssMajor_, typename Ps2RHssMinor_, typename Ys2RHsMajor_, typename Ys2RHsMinor_>
CK_TILE_HOST_DEVICE void print (const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > &encoding)
template<typename InOutElementFunc, typename... InOutDstrTensors, typename = std::enable_if_t<std::conjunction_v< std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>>
CK_TILE_DEVICE void tile_elementwise_inout (const InOutElementFunc &inout_element_func, InOutDstrTensors &... inout_dstr_tensors)
template<typename InElementFunc, typename... InTensor, typename = std::enable_if_t< std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>>
CK_TILE_DEVICE auto tile_elementwise_in (const InElementFunc &in_element_func, const InTensor &... in_dstr_tensors)
template<typename InElementFunc, typename Tuple, size_t... I>
CK_TILE_DEVICE auto tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t, std::index_sequence< I... >)
 Template function that "unpacks" a tuple and applies an element-wise operation.
template<typename InElementFunc, typename Tuple>
CK_TILE_DEVICE auto tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t)
 Template function that "unpacks" a tuple and applies an element-wise operation.
template<typename DstrTensors, typename T>
CK_TILE_DEVICE void set_tile (DstrTensors &dstr_tensor, const T &value)
template<typename T>
CK_TILE_DEVICE void set_tile (null_tensor &, const T &)
template<typename DstrTensors, index_t v, bool skip_subdword_opt = false>
CK_TILE_DEVICE void set_tile (DstrTensors &dstr_tensor, number< v >, bool_constant< skip_subdword_opt >={})
template<index_t v>
CK_TILE_DEVICE void set_tile (null_tensor &, number< v >)
template<typename DstrTensors>
CK_TILE_DEVICE void clear_tile (DstrTensors &dstr_tensor)
template<typename DstType, typename SrcTensor>
CK_TILE_DEVICE auto cast_tile (const SrcTensor &src_tensor)
template<typename InOutElementFunc, typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE void tile_elementwise_inout (const InOutElementFunc &, MaybeNullTensor &&...)
template<typename InElementFunc, typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE auto tile_elementwise_in (const InElementFunc &, MaybeNullTensor &&...)
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, number< HsGatherDim >={}, number< NumCoord >={})
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={})
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={})
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, typename StaticValidArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, const StaticValidArray_ &valids, number< HsGatherDim >={}, number< NumCoord >={})
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, typename StaticValidArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={})
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, typename StaticValidArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={})
template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, typename StaticValidArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE auto replace_bottom_tensor_view (const NewTensorView_ &new_tensor_view, const tile_scatter_gather< OldTensorView_, WindowLengths_, StaticTileDistribution_, StaticPageIndexArray_, StaticValidArray_, HsGatherDim, NumCoord > &tile_window)
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={})
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE auto make_tile_window_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={})
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord>
CK_TILE_DEVICE void move_tile_window (tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &window, const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &step)
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord>
CK_TILE_DEVICE void move_tile_window (tuple< tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > > &window, const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &step)
template<typename TileWindowWithStaticDistributionType, typename StepType, typename std::enable_if_t< is_detected< is_tuple, TileWindowWithStaticDistributionType >::value > * = nullptr>
CK_TILE_DEVICE void move_tile_window (TileWindowWithStaticDistributionType &window, StepType &step)
template<typename TensorView_, typename WindowLengths_>
CK_TILE_DEVICE constexpr auto make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin)
template<typename TensorView, typename WindowLengths>
CK_TILE_DEVICE constexpr auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin)
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution)
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution)
template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto make_tile_window_raw (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution)
template<typename TensorView_, typename WindowLengths_>
CK_TILE_DEVICE void move_tile_window (tile_window_with_static_lengths< TensorView_, WindowLengths_ > &window, const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex &step)
template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE auto replace_bottom_tensor_view (const NewTensorView_ &new_tensor_view, const tile_window_with_static_distribution< OldTensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &tile_window)
template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_>
CK_TILE_DEVICE auto replace_bottom_tensor_view (const NewTensorView_ &new_tensor_view, const tile_window_with_static_lengths< OldTensorView_, WindowLengths_ > &tile_window)
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE constexpr auto make_tile_window_linear (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
template<typename TileWindow_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto make_tile_window_linear (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE auto make_tile_window_linear_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
template<typename TileWindow_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto make_tile_window_linear_raw (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_>
CK_TILE_DEVICE void move_tile_window (tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > &window, const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex &step)
template<typename TileWindow_>
CK_TILE_DEVICE void move_tile_window (TileWindow_ &window, const typename TileWindow_::BottomTensorIndex &step)
template<typename LdsTileWindow_>
CK_TILE_DEVICE auto get_async_store_smem_info (LdsTileWindow_ &&lds_tile)
template<typename OutTensor, typename InTensor>
CK_TILE_DEVICE void transpose_tile2d (OutTensor &out, const InTensor &in)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void update_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void update_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={})
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void update_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto update_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
template<typename Y, typename X>
CK_TILE_HOST_DEVICE constexpr Y bit_cast (const X &x)
template<auto... val>
constexpr void CK_PRINT ()
template<typename... type>
constexpr void CK_PRINT ()
template<size_t... Idx>
constexpr std::tuple< std::integral_constant< size_t, Idx >... > makeTuple (std::index_sequence< Idx... >) noexcept
constexpr size_t constexpr_strlen (const char *c)
template<typename... Args>
void CK_TILE_ERROR (Args &&... args) noexcept
template<class EnvVar>
const std::string & EnvGetString (EnvVar)
template<class EnvVar>
bool EnvIsEnabled (EnvVar)
template<class EnvVar>
bool EnvIsDisabled (EnvVar)
template<class EnvVar>
uint64_t EnvValue (EnvVar)
template<class EnvVar>
bool EnvIsUnset (EnvVar)
template<class EnvVar>
void EnvUnset (EnvVar)
template<typename EnvVar, typename ValueType>
void UpdateEnvVar (EnvVar, const ValueType &val)
 Updates the cached value of an environment variable.
template<typename EnvVar>
void UpdateEnvVar (EnvVar, const std::string_view &val)
template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto unpack (F &&f, X &&x)
template<typename F, typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto unpack2 (F &&f, X &&x, Y &&y)
template<bool predicate, typename X, typename Y>
constexpr auto conditional_expr (X &&x, Y &&y)
void validate_stride (std::string Layout, int M, int N, int stride, const std::string &stride_name)
void validate_gemm_stride (std::string a_layout, std::string b_layout, std::string c_layout, int M, int N, int K, int Stride_A, int Stride_B, int Stride_C)
template<typename T>
CK_TILE_HOST_DEVICE void print (const T &)
template<>
CK_TILE_HOST_DEVICE void print (const int &value)
 Specialization for int.
template<>
CK_TILE_HOST_DEVICE void print (const float &value)
 Specialization for float.
template<>
CK_TILE_HOST_DEVICE void print (const double &value)
 Specialization for double.
template<>
CK_TILE_HOST_DEVICE void print (const long &value)
 Specialization for long.
template<>
CK_TILE_HOST_DEVICE void print (const unsigned int &value)
 Specialization for unsigned int.
template<>
CK_TILE_HOST_DEVICE void print (const char &value)
 Specialization for char.
template<typename T, size_t N>
CK_TILE_HOST_DEVICE void print (const T(&value)[N])
 Overload for built-in arrays (const T(&)[N]).
template<typename PY, typename PX, typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type = false>
CK_TILE_HOST_DEVICE PY c_style_pointer_cast (PX p_x)
template<typename... Ts>
__host__ __device__ composes (Ts &&...) -> composes< remove_cvref_t< Ts >... >
 FIXME: create macro to replace '__host__ __device__' and nothing more.
template<typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_relative_threshold (const int number_of_accumulations=1)
 Calculate relative error threshold for numerical comparisons.
template<typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_absolute_threshold (const double max_possible_num, const int number_of_accumulations=1)
 Calculate absolute error threshold for numerical comparisons.
template<typename T>
std::ostream & operator<< (std::ostream &os, const std::vector< T > &v)
 Stream operator overload for vector output.
template<typename Range, typename RefRange>
CK_TILE_HOST bool check_size_mismatch (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!")
 Check for size mismatch between output and reference ranges.
CK_TILE_HOST void report_error_stats (int err_count, double max_err, std::size_t total_size)
 Report error statistics for numerical comparisons.
template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_floating_point_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-5, double atol=3e-6, bool allow_infinity_ref=false)
 Check errors between floating point ranges using the specified tolerances.
template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf16_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between bf16_t (bfloat16) ranges using the specified tolerances.
template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between half precision floating point ranges.
template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_integral_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, bf16_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double=0, double atol=0)
 Check errors between integer ranges.
template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, fp8_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", unsigned max_rounding_point_distance=1, double atol=1e-1, bool allow_infinity_ref=false)
 Check errors between FP8 ranges.
template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf8_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between BF8 ranges.
template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, pk_fp4_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double=0, double=0)
 Check errors between pk_fp4_t ranges.
template<typename... Ts>
auto concat (const Ts &... xs) -> std::enable_if_t<!AllConvertibleToStringView< Ts... >, std::string >
template<std::size_t N>
constexpr std::size_t getSize (char(&)[N]) noexcept
template<std::size_t N>
constexpr std::size_t getSize (const char(&)[N]) noexcept
constexpr std::size_t getSize (const char *s) noexcept
constexpr std::size_t getSize (const char &) noexcept
std::size_t getSize (const std::string &s) noexcept
constexpr std::size_t getSize (const std::string_view &s) noexcept
template<typename... Ts>
auto concatInto (std::string &result, const Ts &... xs) -> std::enable_if_t< AllConvertibleToStringView< Ts... >, void >
template<typename... Ts>
auto concat (const Ts &... xs) -> std::enable_if_t< AllConvertibleToStringView< Ts... >, std::string >
template<typename Sep, typename First, typename... Rest>
auto concat (Sep sep, const First &first, const Rest &... rest) -> std::enable_if_t< AllConvertibleToStringView< First, Rest... >, std::string >
template<typename Sep, typename First, typename... Rest>
auto concat (Sep sep, const First &first, const Rest &... rest) -> std::enable_if_t<!AllConvertibleToStringView< First, Rest... >, std::string >
template<typename T>
__global__ void set_buffer_value (T *p, T x, uint64_t buffer_element_size)
constexpr unsigned int fnv1a_hash (std::string_view str, unsigned int h=2166136261u)
std::string get_device_name ()
bool is_gfx11_supported ()
bool is_gfx12_supported ()
bool is_load_tr_supported ()
CK_TILE_HOST void hip_check_error (hipError_t x)
template<typename Range>
CK_TILE_HOST std::ostream & LogRange (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
template<typename T, typename Range>
CK_TILE_HOST std::ostream & LogRangeAsType (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
template<typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto call_f_unpack_args_impl (F f, T args, std::index_sequence< Is... >)
template<typename F, typename T>
CK_TILE_HOST auto call_f_unpack_args (F f, T args)
template<typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto construct_f_unpack_args_impl (T args, std::index_sequence< Is... >)
template<typename F, typename T>
CK_TILE_HOST auto construct_f_unpack_args (F, T args)
template<typename New2Old>
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old (const HostTensorDescriptor &a, const New2Old &new2old)
template<typename F, typename... Xs>
CK_TILE_HOST auto make_ParallelTensorFunctor (F f, Xs... xs)
template<bool is_row_major>
auto host_tensor_descriptor (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
 Creates a host tensor descriptor with specified dimensions and layout.
template<bool is_row_major>
auto get_default_stride (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
template<int MinBlockPerCu, typename Kernel, typename... Args>
__global__ void kentry (Args... args)
template<typename Arch, int MinBlockPerCu, typename Kernel, typename... Args>
__global__ void kentry (Args... args)
template<int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename Arch = void, typename KernelImpl, typename... Args>
CK_TILE_HOST auto make_kernel (KernelImpl, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
template<typename... Callables>
CK_TILE_HOST void launch_and_check (const stream_config &sc, Callables &&... callables)
template<typename TimerType, typename PreprocessFunc>
CK_TILE_HOST double preprocess_profiling_impl (TimerType timer, const stream_config &s, PreprocessFunc preprocess)
template<typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl (TimerType timer, const stream_config &s, CallablesFunc &&callables_func, PreprocessFunc preprocess=nullptr)
template<typename... Callables>
CK_TILE_HOST float launch_kernel (const stream_config &s, Callables &&... callables)
template<typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float launch_kernel_time_mask (const stream_config &s, PreprocessFunc preprocess, Callables &&... callables)
template<typename Tensor>
void permute_vectors_i4x4_b (Tensor &tensor)
 Permute packed int4 vectors for device implementation compatibility.
template<typename ADataType, typename BDataType, typename DDataType, typename EDataType, typename AccDataType, typename CDEElementWise>
void calculate_reference_flat_indexing (const ck_tile::HostTensor< ADataType > &a_full_dims, const ck_tile::HostTensor< BDataType > &b_full_dims, const std::vector< ck_tile::HostTensor< DDataType > > &ds_full_dims_host, ck_tile::HostTensor< EDataType > &e_full_dims_host_ref, ck_tile::index_t G_total, ck_tile::index_t M_total, ck_tile::index_t N_total, ck_tile::index_t K_total, const CDEElementWise &cde_elementwise)
template<typename ADataType, typename BDataType, typename DDataType, typename EDataType, typename AccDataType, typename CDEElementWise>
void calculate_reference_multi_dimensional (const HostTensor< ADataType > &a_full_dims, const HostTensor< BDataType > &b_full_dims, const std::vector< HostTensor< DDataType > > &ds_full_dims_host, HostTensor< EDataType > &e_full_dims_host_ref, const std::vector< index_t > &G_dims, const std::vector< index_t > &M_dims, const std::vector< index_t > &N_dims, const std::vector< index_t > &K_dims, const std::vector< index_t > &A_dims, const std::vector< index_t > &B_dims, const std::vector< index_t > &E_dims, const CDEElementWise &cde_elementwise)
template<typename DataType, typename RandValOutputDataType>
CK_TILE_HOST void reference_batched_dropout (HostTensor< DataType > &in_out_b_m_n, const HostTensor< RandValOutputDataType > &randval_b_m_n, const uint8_t &p_undrop_in_uint8_t, const float scale)
template<typename RandValOutputDataType>
CK_TILE_HOST void reference_batched_dropout_randval (HostTensor< RandValOutputDataType > &randval_b_m_n, index_t batch, uint64_t drop_seed, uint64_t drop_offset)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void reference_batched_elementwise (const HostTensor< ADataType > &a_b_m_n, const HostTensor< BDataType > &b_b_m_n, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const BinaryElementOp &binary_element_op={})
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_gemm (const HostTensor< ADataType > &a_b_m_k, const HostTensor< BDataType > &b_b_n_k, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
template<typename CDataType, typename MaskingType>
CK_TILE_HOST void reference_batched_masking (HostTensor< CDataType > &c_b_m_n, const MaskingType &mask)
template<typename DataType, typename ComputeDataType = float>
CK_TILE_HOST void reference_batched_rotary_position_embedding (const HostTensor< DataType > &input_bsd, const HostTensor< DataType > &cos_sd, const HostTensor< DataType > &sin_sd, bool interleaved, HostTensor< DataType > &output_bsd, bool use_1_row_sin_cos=false)
template<typename ADataType, typename CompDataType, typename BDataType, typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax (const HostTensor< ADataType > &a_b_m_n, HostTensor< BDataType > &b_b_m_n, const CompElementOp &comp_element_op={}, std::optional< std::reference_wrapper< HostTensor< CompDataType > > > lse_b_m=std::nullopt)
template<typename Type>
CK_TILE_HOST void reference_batched_transpose (const HostTensor< Type > &x, HostTensor< Type > &y, std::string layout_in="NCHW", std::string layout_out="NHWC")
template<typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void reference_unary_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b, ElementOp element_op)
template<typename ADataType, typename BDataType, typename CDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void reference_binary_elementwise (const HostTensor< ADataType > &a, const HostTensor< BDataType > &b, HostTensor< CDataType > &c, ElementOp element_op)
template<typename AccDataType, typename Activation, typename ADataType, typename GDataType, typename DDataType, typename ODataType, typename AScaleDataType, typename GScaleDataType, typename DScaleDataType, typename YSmoothScaleDataType, typename TopkWeightDataType, typename IndexDataType>
void reference_fused_moe (const ck_tile::HostTensor< ADataType > &a_host, const ck_tile::HostTensor< GDataType > &g_host, const ck_tile::HostTensor< DDataType > &d_host, const ck_tile::HostTensor< AScaleDataType > &sa_host, const ck_tile::HostTensor< GScaleDataType > &sg_host, const ck_tile::HostTensor< DScaleDataType > &sd_host, const ck_tile::HostTensor< YSmoothScaleDataType > &sy_host, ck_tile::HostTensor< ODataType > &o_host, const ck_tile::HostTensor< IndexDataType > &sorted_token_ids_host, const ck_tile::HostTensor< TopkWeightDataType > &sorted_weight_host, const ck_tile::HostTensor< IndexDataType > &sorted_expert_ids_host, const ck_tile::HostTensor< IndexDataType > &num_sorted_tiles_host, const ck_tile::HostTensor< IndexDataType > &token_ids_host, ck_tile::index_t block_m, ck_tile::index_t tokens, ck_tile::index_t experts, ck_tile::index_t hidden_size, ck_tile::index_t intermediate_size, ck_tile::index_t topk, ck_tile::index_t gate_only)
template<typename ADataType, typename QDataType, typename BDataType, typename AccDataType, typename CDataType, typename QuantGroupSize, bool aquant, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_quant (const HostTensor< ADataType > &a_m_k, const HostTensor< QDataType > &q, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
template<typename ADataType, typename AQDataType, typename BDataType, typename BQDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_rowcol_quant (const HostTensor< ADataType > &a_m_k, const HostTensor< AQDataType > &aq_m_1, const HostTensor< BDataType > &b_k_n, const HostTensor< BQDataType > &bq_1_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
template<typename ADataType, typename AQDataType, typename BDataType, typename BQDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_tensor_quant (const HostTensor< ADataType > &a_m_k, const HostTensor< AQDataType > &aq_1_1, const HostTensor< BDataType > &b_k_n, const HostTensor< BQDataType > &bq_1_1, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
template<typename AsDataType, typename BsDataType, typename DsDataType, typename AccDataType, typename CDataType, typename AElementOp, typename BElementOp, typename CDElementOp, typename ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>, typename BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>, typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void reference_gemm_multiple_abd (const std::array< HostTensor< ADataType >, AsDataType::size()> &as_m_k, const std::array< HostTensor< BDataType >, BsDataType::size()> &bs_k_n, const std::array< HostTensor< DDataType >, DsDataType::size()> &ds_m_n, HostTensor< ADataType > &a_m_k, HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const CDElementOp &acc_element_op={})
template<typename ADataType, typename BDataType, typename ScaleDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_mx_gemm (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const HostTensor< ScaleDataType > &scale_a, const HostTensor< ScaleDataType > &scale_b, const AElementOp &={}, const BElementOp &={}, const ACCElementOp &={})
template<typename ADataType, typename BDataType, typename DsDataType, typename AccDataType, typename CDataType, typename ACCElementOp, typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void reference_gemm_multiple_d (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, const std::array< HostTensor< DDataType >, DsDataType::size()> &ds_m_n, HostTensor< CDataType > &c_m_n, const ACCElementOp &acc_element_op={})
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
__global__ void naive_gemm_kernel (ADataType *A, BDataType *B, CDataType *C, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
__global__ void blockwise_gemm_kernel (ADataType *A, BDataType *B, CDataType *C, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC, ck_tile::index_t scale_granularity_m, ck_tile::index_t scale_granularity_n, ck_tile::index_t scale_granularity_k, float *scale_A_ptr, float *scale_B_ptr)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void reference_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void reference_blockwise_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t scale_granularity_m, index_t scale_granularity_n, index_t scale_granularity_k, float *scale_A_ptr, float *scale_B_ptr)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void reference_batched_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t batch_stride_A, index_t batch_stride_B, index_t batch_stride_C, index_t batch_count)
template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
CK_TILE_HOST void reference_grouped_conv_bwd_data (HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >)
template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
CK_TILE_HOST void reference_grouped_conv_bwd_weight (const HostTensor< InDataType > &input, HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >)
template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType, typename Elfunc = ck_tile::element_wise::PassThrough, typename Tuple = ck_tile::tuple<>>
CK_TILE_HOST void reference_grouped_conv_fwd (const HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >, Elfunc elfunc=Elfunc{}, Tuple ds={})
template<typename InDataType, typename OutDataType, index_t NDimSpatial>
CK_TILE_HOST void reference_im2col (const HostTensor< InDataType > &in_host, HostTensor< OutDataType > &out_host, const ck_tile::conv::ConvParam &conv_params)
template<typename XDataType, typename GammaDataType, typename BetaDataType, typename ComputeDataType, typename YDataType, typename MeanDataType, typename InvStdDataType, typename Epilogue = reference_layernorm2d_default_epilogue>
void reference_layernorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, const HostTensor< BetaDataType > &beta_n, HostTensor< YDataType > &y_m_n, HostTensor< MeanDataType > &mean_m, HostTensor< InvStdDataType > &invStd_m, ComputeDataType epsilon, Epilogue epilogue_functor={})
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC, int MoeGemmKind = 0, typename ActivationOp = identity>
__global__ void moe_gemm_kernel (const ck_tile::index_t *p_sorted_token_ids_, const ck_tile::index_t *p_sorted_expert_ids_, const ck_tile::index_t *p_max_token_id_, const ADataType *A, const BDataType *B, CDataType *C, const AccDataType *expert_weight_ptr, ck_tile::index_t Num_tokens, ck_tile::index_t TokensPerBlock, ck_tile::index_t TopK, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC, index_t scale_granularity_m, index_t scale_granularity_n, index_t scale_granularity_k, float *scale_A_ptr, float *scale_B_ptr, float *expert_bias_ptr)
template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC, int MoeGemmKind = 0, typename ActivationOp = identity>
void reference_moe_gemm_gpu (const index_t *p_sorted_token_ids_, const index_t *p_sorted_expert_ids_, const index_t *p_max_token_id_, const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, const AccDataType *expert_weight_ptr, index_t Num_tokens, index_t TokensPerBlock, index_t TopK, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t scale_granularity_m, index_t scale_granularity_n, index_t scale_granularity_k, float *scale_A_ptr, float *scale_B_ptr, float *exp_bias=nullptr)
template<typename WeightType, typename IndexType = index_t>
CK_TILE_HOST void reference_moe_sorting (const HostTensor< IndexType > &topk_ids, const HostTensor< WeightType > &weights, const HostTensor< IndexType > &local_expert_mask, HostTensor< IndexType > &p_sorted_token_ids, HostTensor< WeightType > &sorted_weight, HostTensor< IndexType > &sorted_expert_ids, index_t &unit_cnt, const index_t experts, const index_t unit_size, const index_t tokens, bool local_expert_masking, bool skip_experts_with_zero_token=true)
template<typename DataType>
CK_TILE_HOST void reference_permute (const HostTensor< DataType > &x, HostTensor< DataType > &y, std::vector< index_t > perm)
template<typename DataType>
CK_TILE_HOST auto reference_permute (const HostTensor< DataType > &x, std::vector< index_t > perm)
template<typename InDataType, typename ComputeDataType, typename OutDataType, typename IndexDataType, typename ReduceOp, typename TensorShape, typename WindowShape, bool OutputIndex = false>
CK_TILE_HOST void reference_pool2d (const HostTensor< InDataType > &input, HostTensor< OutDataType > &output, HostTensor< IndexDataType > &output_index, PoolKernelArgs< TensorShape, WindowShape > kargs, ReduceOp reduce_op)
template<typename InDataType, typename ComputeDataType, typename OutDataType, typename IndexDataType, typename ReduceOp, typename TensorShape, typename WindowShape, bool OutputIndex = false>
CK_TILE_HOST void reference_pool3d (const HostTensor< InDataType > &input, HostTensor< OutDataType > &output, HostTensor< IndexDataType > &output_index, PoolKernelArgs< TensorShape, WindowShape > kargs, ReduceOp reduce_op)
template<typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
CK_TILE_HOST void reference_reduce (const HostTensor< XDataType > &x_m_n, HostTensor< YDataType > &y_m, ReduceOp reduce_op)
template<typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp, typename KeptDim, typename ReduceDims>
CK_TILE_HOST void reference_reduce (const HostTensor< XDataType > &x_tensor, HostTensor< YDataType > &y_tensor, ReduceOp reduce_op, KeptDim kept_dim, ReduceDims reduce_dims)
template<typename XDataType, typename GammaDataType, typename ComputeDataType, typename YDataType, typename InvRmsDataType, typename UnquantYDataType, typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, HostTensor< YDataType > &y_m_n, HostTensor< InvRmsDataType > &invRms_m, HostTensor< UnquantYDataType > &unquant_y_m_n, ComputeDataType epsilon, Epilogue epilogue_functor={}, const int use_model_sensitive_rmsnorm=static_cast< int >(Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL))
template<typename XDataType, typename ScaleDataType, typename QXDataType>
CK_TILE_HOST void reference_rowwise_quantization2d (const HostTensor< XDataType > &x_m_n, const HostTensor< ScaleDataType > &scale_m, HostTensor< QXDataType > &qx_m_n)
template<typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST void reference_softmax (const HostTensor< InputType > &x, HostTensor< OutputType > &y, index_t dim=-1)
template<typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST auto reference_softmax (const HostTensor< InputType > &x, index_t dim=-1)
template<typename DataType, typename IndexType = index_t>
CK_TILE_HOST void reference_topk (const HostTensor< DataType > &x, HostTensor< DataType > &y_values, HostTensor< IndexType > &y_indices, index_t k, index_t dim=-1, bool largest=true, bool sorted=true)
template<typename DataType, typename IndexType = index_t>
CK_TILE_HOST auto reference_topk (const HostTensor< DataType > &x, index_t k, index_t dim=-1, bool largest=true, bool sorted=true)
template<typename ADataType, typename BDataType>
void reference_transpose_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b)
void flush_icache ()
template<typename T>
auto shuffle_aq (const ck_tile::HostTensor< T > *t, int block_aq_k)
template<typename GemmConfig, typename T>
auto shuffle_b (const ck_tile::HostTensor< T > &t)
template<typename GemmConfig, typename T>
auto shuffle_bq_permuteN (const ck_tile::HostTensor< T > &t)
template<typename GemmConfig, typename T>
auto shuffle_b_permuteN (const ck_tile::HostTensor< T > &t)
template<typename BDataType, typename ComputeDataType, index_t UnaryOpSize, typename WarpTile, typename WarpWindow>
CK_TILE_DEVICE void load_int4_tile (WarpTile &dst, const WarpWindow &src)
template<ck_tile::StreamKReductionStrategy ReductionStrategy>
ck_tile::index_t estimate_num_wgs_per_tile (index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile)
 Estimates the number of Stream-K workgroups per macro tile in the C tensor.
template<typename ADataType_, typename BDataType_>
std::string gemm_prec_str ()
template<memory_operation_enum MemOp_>
std::string mem_op_string ()
CK_TILE_HOST_DEVICE constexpr auto make_generic_attention_mask_coordinates_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true)
template<typename MaskType>
CK_TILE_HOST_DEVICE constexpr auto make_generic_attention_mask_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true)
template<typename DataType, bool RowMajor = true, unsigned LogMaxSadOprndSize = 16>
CK_TILE_HOST_DEVICE auto make_alibi_from_lr_mask (DataType slope, index_t window_left_size, index_t window_right_size, index_t y_total, index_t x_total, GenericAttentionMaskEnum mask_enum)
template<typename DataType>
CK_TILE_HOST std::vector< DataType > get_alibi_slopes (ck_tile::index_t nheads)
template<typename TensorView>
CK_TILE_HOST_DEVICE auto make_page_block_navigator (const TensorView &tensor_view)
template<typename DataType, index_t VirtualDim, typename TensorView>
CK_TILE_HOST_DEVICE auto make_page_block_navigator (copy_const_t< DataType, void > *physical_blocks, long_index_t block_stride, long_index_t fixed_offset, const int32_t *physical_block_indices, index_t num_blocks, index_t page_block_size, const TensorView &complete_view, const TensorView &last_view)
CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col (int tokens_, int num_experts_)
CK_TILE_HOST index_t moe_sorting_get_sub_token (int tokens_, int num_experts_)
CK_TILE_HOST bool moe_sorting_is_oneshot (int tokens_, int num_experts_)
CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size (int tokens_, int num_experts_, int topk_)
CK_TILE_HOST index_t moe_sorting_get_workspace_size (int tokens_, int num_experts_, int topk_, int dispatch_policy_)
template<typename ADataType, typename BDataType, typename AccDataType, index_t M_Warp_Tile, index_t N_Warp_Tile, index_t K_Warp_Tile>
CK_TILE_HOST bool check_wmma_supported ()
std::string quant_type_to_string (QuantType quant_type)
CK_TILE_HOST std::string getConvSpecializationString (const ConvolutionSpecialization &s)
template<typename TilePartitioner>
CK_TILE_HOST SplitImagePieceInfo calculate_spatial_piece (ck_tile::index_t piece_idx, ck_tile::index_t num_d_pieces, ck_tile::index_t num_h_pieces, ck_tile::index_t num_w_pieces, ck_tile::index_t base_piece_d, ck_tile::index_t base_piece_h, ck_tile::index_t base_piece_w, ck_tile::index_t total_d, ck_tile::index_t total_h, ck_tile::index_t total_w, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t total_blocks)
 Calculate piece information for split-image convolution.
template<typename BlockShape>
CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count (int row_size)
template<typename VarDistributedTensor_, bool FastFdiv_ = false>
CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var (VarDistributedTensor_ &var_tensor, int count, bool_constant< FastFdiv_ >={})
template<typename T, bool kFastFDiv = false>
CK_TILE_DEVICE void welford_update (T &mean, T &var, T x, int count, bool_constant< kFastFDiv >={})
template<typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true, bool CrossWarp = true>
CK_TILE_DEVICE void block_tile_reduce_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func, bool_constant< WithBroadcast >={}, bool_constant< CrossWarp >={})
template<typename AccDistributedTensor_, typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce_xor_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func)
template<typename AccDistributedTensor_, typename InDistributedTensor_, index_t... InReduceDims, typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce (AccDistributedTensor_ &acc_tensor, const InDistributedTensor_ &in_tensor, sequence< InReduceDims... >, const ReduceFunc &reduce_func)
template<typename AccDataType_, typename InDistributedTensor_, index_t... InReduceDims, typename ReduceFunc, typename InDataType_>
CK_TILE_DEVICE auto block_tile_reduce (const InDistributedTensor_ &in_tensor, sequence< InReduceDims... > in_reduce_dims, const ReduceFunc &reduce_func, const InDataType_ &reduce_init)
template<typename T>
CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D (const T &, const typename T::DataType &) -> BlockReduce2D< T >
CK_TILE_HOST float naive_attention_fwd (naive_attention_fwd_traits t, naive_attention_fwd_args a, ck_tile::stream_config s)

Variables

template<typename T>
constexpr bool is_constant_v = is_constant<T>::value
template<typename T = double>
constexpr T log2e_v = log2e<T>::value
template<typename T = double>
constexpr T log2e_rcp_v = 1. / log2e<T>::value
template<typename T>
constexpr bool is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value
template<typename T>
constexpr bool is_tile_window_with_static_distribution_v
 Helper variable template to check if a type is a tile window with static distribution.
template<typename T>
constexpr bool is_tile_window_with_static_lengths_v
 Helper variable template to check if a type is a tile window with static lengths.
template<typename T>
constexpr bool is_tile_window_linear_v = is_tile_window_linear<T>::value
 Helper variable template to check if a type is a linear tile window.
constexpr detail::ignore_t ignore
template<typename T>
constexpr bool is_static_v = is_static<T>::value
constexpr int ERROR_DETAIL_LIMIT = 128
 Maximum number of error values to display when checking errors.
template<typename... Ts>
constexpr bool AllConvertibleToStringView
constexpr uint32_t CUSTOM_MASK = 1U
constexpr uint32_t SLIDING_WINDOW = 2U
constexpr uint32_t LOGITS_SOFT_CAP = 4U
constexpr uint32_t ALIBI = 8U
template<typename Arch, typename AType, typename BType, typename CType, index_t warp_m, index_t warp_n, index_t warp_k>
constexpr bool has_wmma_traits_v

Typedef Documentation

◆ BF16

16-bit brain floating point type

◆ bf16_raw_t

◆ bf16_t

◆ bf16x16_t

◆ bf16x2_t

◆ bf16x32_t

◆ bf16x4_t

◆ bf16x64_t

◆ bf16x8_t

◆ BF8

8-bit brain floating point type

◆ bf8_raw_t

◆ bf8_t

using ck_tile::bf8_t = unsigned _BitInt(8)

◆ bf8x16_t

◆ bf8x2_t

◆ bf8x32_t

◆ bf8x4_t

◆ bf8x64_t

◆ bf8x8_t

◆ bfloat16_t

using ck_tile::bfloat16_t = ushort

◆ BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy

Initial value:
true,
3,
3>
Definition block_fmha_pipeline_qx_ks_vs_custom_policy.hpp:266

◆ BlockFmhaPipelineQRKSVSAsyncDefaultPolicy

◆ BlockFmhaPipelineQRKSVSDefaultPolicy

◆ bool_constant

template<bool b>
using ck_tile::bool_constant = constant<b>

◆ copy_const_t

template<typename From, typename To>
using ck_tile::copy_const_t = typename copy_const<From, To>::type

◆ Default2DAndDynamicQuantEpilogueTraits

template<bool kPadM_, bool kPadN_, bool UseSmoothInputScale_, bool UseRawStore_ = true, bool UseMax3_ = false>
using ck_tile::Default2DAndDynamicQuantEpilogueTraits

◆ default_linear_bottom_dims

template<typename TensorView_>
using ck_tile::default_linear_bottom_dims
Initial value:
typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
TensorView_::get_num_of_dimension()>::type
Definition tile_window_linear.hpp:947

◆ DeviceIp

using ck_tile::DeviceIp = remove_cvref_t<decltype(ck_tile::get_device_arch())>

◆ e8m0_raw_t

◆ e8m0_t

◆ ext_vector_t

template<typename T, index_t N>
using ck_tile::ext_vector_t = typename impl::ext_vector<T, N>::type

◆ F16

16-bit floating point (half precision) type

◆ F32

using ck_tile::F32 = float

32-bit floating point (single precision) type

◆ F8

8-bit floating point type

◆ FlatmmHostArgs

template<int NumberTensor = 0>
using ck_tile::FlatmmHostArgs
Initial value:
Definition flatmm_kernel.hpp:33
Definition flatmm_kernel.hpp:187

◆ fp16_hip_t

using ck_tile::fp16_hip_t = _Float16

◆ fp16_raw_t

typedef ushort ck_tile::fp16_raw_t = uint16_t

◆ fp16_t

using ck_tile::fp16_t = _Float16

◆ fp16x16_t

using ck_tile::fp16x16_t = _Float16

◆ fp16x2_t

using ck_tile::fp16x2_t = _Float16

◆ fp16x32_t

using ck_tile::fp16x32_t = _Float16

◆ fp16x4_t

using ck_tile::fp16x4_t = _Float16

◆ fp16x64_t

using ck_tile::fp16x64_t = _Float16

◆ fp16x8_t

using ck_tile::fp16x8_t = _Float16

◆ fp32_t

using ck_tile::fp32_t = float

◆ fp32x16_t

using ck_tile::fp32x16_t = float

◆ fp32x2_t

◆ fp32x32_t

using ck_tile::fp32x32_t = float

◆ fp32x4_t

using ck_tile::fp32x4_t = float

◆ fp32x64_t

using ck_tile::fp32x64_t = float

◆ fp32x8_t

using ck_tile::fp32x8_t = float

◆ fp64_t

using ck_tile::fp64_t = double

◆ fp64x2_t

using ck_tile::fp64x2_t = double

◆ fp64x4_t

using ck_tile::fp64x4_t = double

◆ fp8_raw_t

◆ fp8_t

using ck_tile::fp8_t = _BitInt(8)

◆ fp8x16_t

◆ fp8x2_t

◆ fp8x32_t

◆ fp8x4_t

◆ fp8x64_t

◆ fp8x8_t

◆ GemmAQuantPipelineProblem

template<typename ADataType_, typename AQDataType_, typename BDataType_, typename CDataType_, typename BlockGemmShape_, typename Traits_, typename QuantGroupSize_, bool TransposeC_, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using ck_tile::GemmAQuantPipelineProblem
Initial value:
AQDataType_,
BDataType_,
void,
CDataType_,
BlockGemmShape_,
Traits_,
QuantGroupSize_,
TransposeC_,
ComputeDataType_,
Scheduler_,
HasHotLoop_,
TailNum_>
Definition gemm_quant_pipeline_problem.hpp:33

◆ GemmBQuantPipelineProblem

template<typename ADataType_, typename BDataType_, typename BQDataType_, typename CDataType_, typename BlockGemmShape_, typename Traits_, typename QuantGroupSize_, typename ComputeDataType_ = ADataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using ck_tile::GemmBQuantPipelineProblem
Initial value:
void,
BDataType_,
BQDataType_,
CDataType_,
BlockGemmShape_,
Traits_,
QuantGroupSize_,
false,
ComputeDataType_,
Scheduler_,
HasHotLoop_,
TailNum_>

◆ GemmPipelineAGmemBGmemCRegV2DefaultPolicy

◆ GemmPipelineProblem

template<typename AsDataType_, typename BsDataType_, typename EDataType_, typename BlockGemmShape_, typename Traits_, typename AElementWise_ = ck_tile::element_wise::PassThrough, typename BElementWise_ = ck_tile::element_wise::PassThrough, typename ComputeDataType_ = AsDataType_, bool FixedVectorSize_ = false, index_t VectorSizeA_ = 1, index_t VectorSizeB_ = 1>
using ck_tile::GemmPipelineProblem
Initial value:
BsDataType_,
EDataType_,
BlockGemmShape_,
Traits_,
ComputeDataType_,
AElementWise_,
BElementWise_,
FixedVectorSize_,
VectorSizeA_,
VectorSizeB_>
Definition gemm_pipeline_problem.hpp:25

◆ GemmRowColTensorQuantPipelineProblem

template<typename ADataType_, typename BDataType_, typename CDataType_, typename AccDataType_, typename BlockGemmShape_, typename Traits_, bool TransposeC_ = false, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using ck_tile::GemmRowColTensorQuantPipelineProblem
Initial value:
AccDataType_,
BDataType_,
AccDataType_,
CDataType_,
BlockGemmShape_,
Traits_,
TransposeC_,
ComputeDataType_,
Scheduler_,
HasHotLoop_,
TailNum_>
Definition gemm_group_quant_utils.hpp:267

◆ GroupedConvBwdDataHostArgs

Initial value:
The Grouped Conv kernel host arguments.
Definition grouped_convolution_utils.hpp:20

◆ GroupedConvBwdWeightHostArgs

◆ GroupedConvFwdHostArgs

template<typename CDElementwise = PassThrough>
using ck_tile::GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*, CDElementwise>

◆ half_t

using ck_tile::half_t = _Float16

◆ has_same_scalar_type

template<typename X, typename Y>
using ck_tile::has_same_scalar_type
Initial value:
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<Y>>::scalar_type>
Definition vector_type.hpp:90

◆ I32

32-bit signed integer type

◆ I8

8-bit signed integer type

◆ index_t

◆ InputTileDistributionTraits

template<typename TileDistributionEncoding_, typename DataType_, typename Policy = DefaultTranspose<DataType_>>
using ck_tile::InputTileDistributionTraits

◆ int16x16_t

◆ int16x2_t

◆ int16x32_t

◆ int16x4_t

◆ int16x64_t

◆ int16x8_t

◆ int32_t

◆ int32x16_t

◆ int32x2_t

◆ int32x32_t

◆ int32x4_t

◆ int32x64_t

◆ int32x8_t

◆ int8_t

◆ int8x16_t

◆ int8x2_t

◆ int8x32_t

◆ int8x4_t

◆ int8x64_t

◆ int8x8_t

◆ is_detected

template<template< class... > class Op, class... Args>
using ck_tile::is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t

◆ is_known_at_compile_time

template<typename T>
using ck_tile::is_known_at_compile_time = is_static<T>

◆ is_static

template<typename T>
using ck_tile::is_static = impl::is_static_impl<remove_cvref_t<T>>

◆ is_tuple

template<typename T>
using ck_tile::is_tuple = decltype(std::declval<T&>().IsTuple())

◆ iter_difference_t

template<typename T>
using ck_tile::iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type

◆ iter_reference_t

template<typename T>
using ck_tile::iter_reference_t = decltype(*std::declval<T&>())

◆ iter_value_t

template<typename T>
using ck_tile::iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type

◆ long_index_t

◆ long_number

template<long_index_t v>
using ck_tile::long_number = constant<v>

◆ magic_division

◆ make_index_sequence

Initial value:
typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type

◆ multi_index

template<index_t N>
using ck_tile::multi_index = array<index_t, N>

◆ number

template<index_t v>
using ck_tile::number = constant<v>

◆ OutputTileDistributionTraits

template<typename TileDistributionEncoding_, typename DataType_, typename Policy = DefaultTranspose<DataType_>>
using ck_tile::OutputTileDistributionTraits

◆ PassThrough

◆ PersistentTileGemmUniversalTraits

template<bool kPadM_, bool kPadN_, bool kPadK_, bool DoubleSmemBuffer_, typename AsLayout_, typename BsLayout_, typename CLayout_, bool TransposeC_ = false, bool UseStructuredSparsity_ = false>
using ck_tile::PersistentTileGemmUniversalTraits
Initial value:
kPadN_,
kPadK_,
DoubleSmemBuffer_,
AsLayout_,
BsLayout_,
CLayout_,
TransposeC_,
UseStructuredSparsity_,
true>
Definition tile_gemm_traits.hpp:48

◆ pk_fp4_raw_t

◆ pk_fp4_t

◆ pk_fp4x16_t

using ck_tile::pk_fp4x16_t = uint8_t __attribute((ext_vector_type(16)))

◆ pk_fp4x2_t

using ck_tile::pk_fp4x2_t = uint8_t __attribute((ext_vector_type(2)))

◆ pk_fp4x32_t

using ck_tile::pk_fp4x32_t = uint8_t __attribute((ext_vector_type(32)))

◆ pk_fp4x4_t

using ck_tile::pk_fp4x4_t = uint8_t __attribute((ext_vector_type(4)))

◆ pk_fp4x8_t

using ck_tile::pk_fp4x8_t = uint8_t __attribute((ext_vector_type(8)))

◆ pk_int4x16_t

◆ pk_int4x2_t

◆ pk_int4x32_t

◆ pk_int4x4_t

◆ pk_int4x8_t

◆ QuantGroupedGemmKernelArgs

◆ remove_cv_t

template<typename T>
using ck_tile::remove_cv_t = typename std::remove_cv<T>::type

◆ remove_cvref_t

template<typename T>
using ck_tile::remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>

◆ remove_pointer_t

template<typename T>
using ck_tile::remove_pointer_t = typename std::remove_pointer<T>::type

◆ remove_reference_t

template<typename T>
using ck_tile::remove_reference_t = typename std::remove_reference<T>::type

◆ safe_underlying_type_t

template<typename T>
using ck_tile::safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type

◆ sequence_merge_t

template<typename... Seqs>
using ck_tile::sequence_merge_t = typename sequence_merge<Seqs...>::type

◆ statically_indexed_array

template<typename T, index_t N>
using ck_tile::statically_indexed_array = tuple_array<T, N>

◆ thread_buffer

template<typename T, index_t N>
using ck_tile::thread_buffer = tuple_array<T, N>

◆ tile_distribution_encoding_shuffle_t

template<typename encoding, typename shuffle>
using ck_tile::tile_distribution_encoding_shuffle_t
Initial value:

◆ tuple_array

template<typename T, index_t N>
using ck_tile::tuple_array = typename impl::tuple_array_impl<T, N>::type

◆ tuple_element_or_default_t

template<typename Tuple_, std::size_t Idx, typename DefaultType>
using ck_tile::tuple_element_or_default_t
Initial value:
typename detail::tuple_element_or_default_dispatch< is_within_bounds, Idx, Tuple, DefaultType >::type type
Definition type_traits.hpp:164

◆ uint16x16_t

◆ uint16x2_t

◆ uint16x32_t

◆ uint16x4_t

◆ uint16x64_t

◆ uint16x8_t

◆ uint32x16_t

◆ uint32x2_t

◆ uint32x32_t

◆ uint32x4_t

◆ uint32x64_t

◆ uint32x8_t

◆ uint8x16_t

◆ uint8x2_t

◆ uint8x32_t

◆ uint8x4_t

◆ uint8x64_t

◆ uint8x8_t

◆ uniform_sequence_gen_t

template<index_t NSize, index_t I>
using ck_tile::uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type

◆ Waitcnt

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8

◆ WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8

◆ WarpGemmDispatcher

template<typename AType, typename BType, typename AccType, index_t MPerWave, index_t NPerWave, index_t KPerWave, bool TransposeC, bool SwizzleA = false, bool UseStructuredSparsity = false, WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmDispatcher
Initial value:
typename impl::WarpGemmDispatcher<AType,
BType,
AccType,
MPerWave,
NPerWave,
KPerWave,
TransposeC,
SwizzleA,
UseStructuredSparsity,
AttrNumAccess>::Type
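Type
Nested member type of the impl::WarpGemmDispatcher specialization shown above. Note: the generated cross-reference resolved this name to an unrelated symbol (rapidjson's "Type of JSON value").
Definition rapidjson.h:760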
Definition warp_gemm_dispatcher.hpp:23

◆ WarpGemmMfma_f32_16x16x128_bf8_bf8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_bf8
Initial value:

◆ WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed
Initial value:
AttrNumAccess>>
WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< bf8_t, bf8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8
Definition warp_gemm_attribute_mfma_impl.hpp:1624
Definition warp_gemm_attribute_mfma.hpp:395

◆ WarpGemmMfma_f32_16x16x128_bf8_fp8

◆ WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed
Initial value:
AttrNumAccess>>
WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< bf8_t, fp8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8
Definition warp_gemm_attribute_mfma_impl.hpp:1620

◆ WarpGemmMfma_f32_16x16x128_fp4

◆ WarpGemmMfma_f32_16x16x128_fp8_bf8

◆ WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_bf8_CTransposed
Initial value:
AttrNumAccess>>
WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< fp8_t, bf8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8
Definition warp_gemm_attribute_mfma_impl.hpp:1616

◆ WarpGemmMfma_f32_16x16x128_fp8_fp8

◆ WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_fp8_CTransposed
Initial value:
AttrNumAccess>>
WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< fp8_t, fp8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8
Definition warp_gemm_attribute_mfma_impl.hpp:1612

◆ WarpGemmMfma_f32_16x16x32_bf8_bf8

◆ WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base< bf8_t, bf8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8
Definition warp_gemm_attribute_mfma_impl.hpp:1518

◆ WarpGemmMfma_f32_16x16x32_fp8_bf8

◆ WarpGemmMfma_f32_16x16x32_fp8_fp8

◆ WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base< fp8_t, fp8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8
Definition warp_gemm_attribute_mfma_impl.hpp:1508

◆ WarpGemmMfma_f32_16x16x64_bf8_bf8

◆ WarpGemmMfma_f32_16x16x64_fp8_fp8

◆ WarpGemmMfma_f32_32x32x16_bf8_bf8

◆ WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< bf8_t, bf8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8
Definition warp_gemm_attribute_mfma_impl.hpp:1526

◆ WarpGemmMfma_f32_32x32x16_bf8_fp8

◆ WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< bf8_t, fp8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8
Definition warp_gemm_attribute_mfma_impl.hpp:1522

◆ WarpGemmMfma_f32_32x32x16_fp8_bf8

◆ WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< fp8_t, bf8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8
Definition warp_gemm_attribute_mfma_impl.hpp:1511

◆ WarpGemmMfma_f32_32x32x16_fp8_fp8

◆ WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed

Initial value:
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< fp8_t, fp8_t, Ctrl_ > WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8
Definition warp_gemm_attribute_mfma_impl.hpp:1505

◆ WarpGemmMfma_f32_32x32x32_bf8_bf8

◆ WarpGemmMfma_f32_32x32x32_fp8_bf8

◆ WarpGemmMfma_f32_32x32x32_fp8_fp8

◆ WarpGemmMfma_f32_32x32x64_bf8_bf8

◆ WarpGemmMfma_f32_32x32x64_bf8_fp8

◆ WarpGemmMfma_f32_32x32x64_fp8_bf8

◆ WarpGemmMfma_f32_32x32x64_fp8_fp8

◆ WarpGemmMfma_i32_16x16x32_i8_i8

◆ WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed

◆ WarpGemmMfma_i32_32x32x16_i8_i8

◆ WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed

◆ WarpGemmMfmaBf16Bf16F32M16N16K16

◆ WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M16N16K32

◆ WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K16

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K16

◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA

◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K8

◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA

◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M4N64K16

◆ WarpGemmMfmaBf16Bf16F32M64N4K16

◆ WarpGemmMfmaF16F16F32M16N16K16

◆ WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M16N16K32

◆ WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K16

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaF16F16F32M32N32K16

◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleA

◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K8

◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleA

◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M4N64K16

◆ WarpGemmMfmaF16F16F32M64N4K16

◆ WarpGemmMfmaF32F32F32M16N16K16

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaF32F32F32M16N16K16

◆ WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution

◆ WarpGemmMfmaF32F32F32M16N16K4

◆ WarpGemmMfmaFp8Fp8F32M32N32K32SwizzleBTransposedCDistribution

◆ WarpGemmSmfmacF16F16F32M16N16K32

Initial value:
Class describing structured sparsity mfma instructions.
Definition warp_gemm_attribute_smfmac.hpp:26
Definition warp_gemm_attribute_smfmac_impl.hpp:65
Definition warp_gemm_smfmac_impl.hpp:11

◆ WarpGemmSmfmacF16F16F32M32N32K16

◆ WarpGemmWmma_f32_16x16x16_bf16_bf16

◆ WarpGemmWmma_f32_16x16x16_bf8_bf8

◆ WarpGemmWmma_f32_16x16x16_bf8_f8

◆ WarpGemmWmma_f32_16x16x16_f16_f16

◆ WarpGemmWmma_f32_16x16x16_f8_bf8

◆ WarpGemmWmma_f32_16x16x16_f8_f8

◆ WarpGemmWmma_i32_16x16x16_i8_i8

Enumeration Type Documentation

◆ address_space_enum

enum struct ck_tile::address_space_enum : std::uint16_t
strong
Enumerator
generic 
global 
lds 
sgpr 
constant 
vgpr 

◆ AlibiMode

enum struct ck_tile::AlibiMode
strong
Enumerator
VERTICAL 
FROM_TOP_LEFT 
FROM_BOTTOM_RIGHT 

◆ amd_buffer_coherence_enum

Enumerator
coherence_default 
glc 
slc 
glc_slc 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ bf16_rounding_mode

enum class ck_tile::bf16_rounding_mode
strong
Enumerator
standard 
truncate_with_nan 
truncate 
standard_asm 
rta_asm 

◆ BlockAttentionBiasEnum

Enumerator
NO_BIAS 
ELEMENTWISE_BIAS 
ALIBI 

◆ BlockFmhaPipelineEnum

enum class ck_tile::BlockFmhaPipelineEnum
strong
Enumerator
QRKSVS 
QRKSVS_ASYNC 
QSKSVS 
QRKSVS_ASYNC_TRLOAD 

◆ ConvolutionSpecialization

Enumerator
Default 
Filter1x1Pad0 
Filter1x1Stride1Pad0 
Filter3x3 

◆ coord_transform_enum

enum struct ck_tile::coord_transform_enum
strong
Enumerator
undefined 
pass_through 
pad 
embed 
merge 
unmerge 
replicate 
xor_t 
offset 
indexing 

◆ fp8_interpretation

enum class ck_tile::fp8_interpretation
strong

FP8 interpretation used in conversion algorithms.

Enumerator
E4M3_OCP 
E5M2_OCP 
E4M3_FNUZ 
E5M2_FNUZ 

◆ fp8_rounding_mode

enum class ck_tile::fp8_rounding_mode
strong
Enumerator
standard 
stochastic 

◆ FusedMoeGemmPipelineSequencerEnum

Enumerator
SLD_A 
SLD_B 
GLD_A 
GLD_B 
SST_A 
SST_B 
GST_O 

◆ FusedMoeGemmWeightPermuteEnum

Enumerator
no_permute 
b_nr_kr_kw_nw_kv 
b_nr_kr_waveflatten 

◆ GemmLoopOrder

enum struct ck_tile::GemmLoopOrder
strong
Enumerator
KMN 
MNK 

◆ GemmPipeline

enum struct ck_tile::GemmPipeline
strong
Enumerator
COMPUTE_ASYNC 
COMPUTE_V3 
COMPUTE_V4 
COMPUTE_V5 
COMPUTE_V6 
MEMORY 
BASIC_V1 
BASIC_V2 
PRESHUFFLE_V2 

◆ GemmPipelineScheduler

enum struct ck_tile::GemmPipelineScheduler
strong
Enumerator
Default 
Intrawave 
Interwave 

◆ GenericAttentionMaskEnum

Enumerator
NO_MASK 
MASK_FROM_TOP_LEFT 
MASK_FROM_BOTTOM_RIGHT 
MASK_GENERIC 

◆ Layernorm2dFusedAddEnum

Enumerator
NO_ADD 
PRE_ADD_STORE 
PRE_ADD 

◆ Layernorm2dFusedQuantEnum

Enumerator
NO_SWEEP 
SMOOTH_DYNAMIC_QUANT 
DYNAMIC_QUANT 

◆ Layernorm2dXBiasEnum

enum class ck_tile::Layernorm2dXBiasEnum
strong
Enumerator
NO_BIAS 
ADD_BIAS 

◆ LLVMSchedGroupMask

Enumerator
NONE 
ALU 
VALU 
SALU 
MFMA 
VMEM 
VMEM_READ 
VMEM_WRITE 
DS 
DS_READ 
DS_WRITE 
ALL 

◆ memory_operation_enum

enum struct ck_tile::memory_operation_enum : std::uint16_t
strong
Enumerator
set 
atomic_add 
atomic_max 
add 

◆ MoeFlatmmKind

enum class ck_tile::MoeFlatmmKind
strong
Enumerator
kFFN_gemm1_gate_only 
kFFN_gemm1_gate_up 
kFFN_gemm2 

◆ naive_attention_layout_enum

Enumerator
DEFAULT 
BSHD 
BHSD 
BS3HD 
PHSD 
PHDSX 
PHDS 
SCALE_HS 
SCALE_SH 

◆ naive_attention_quant_algo

Enumerator
NO 
KV_8BIT_PERHEAD 
KV_8BIT_PERTOKEN 

◆ naive_attention_variation_enum

Enumerator
FLASH_BATCHED 
FLASH_GROUPED 
DECODE_PAGED 

◆ PositionEncodingEnum

enum struct ck_tile::PositionEncodingEnum
strong
Enumerator
NO 
ALIBI 

◆ QuantType

enum struct ck_tile::QuantType : std::uint16_t
strong
Enumerator
AQuantGrouped 
BQuantGrouped 
RowColQuant 
TensorQuant 

◆ Rmsnorm2dFusedAddEnum

enum class ck_tile::Rmsnorm2dFusedAddEnum
strong
Enumerator
NO_ADD 
PRE_ADD_STORE 
PRE_ADD 

◆ Rmsnorm2dFusedQuantEnum

Enumerator
NO_SWEEP 
SMOOTH_DYNAMIC_QUANT 
DYNAMIC_QUANT 

◆ Rmsnorm2dSensitiveEnum

Enumerator
NO_SPECIFIC_MODEL 
T5_MODEL_LIKE 

◆ RotaryEmbeddingEnum

enum class ck_tile::RotaryEmbeddingEnum
strong
Enumerator
NONE 
INTERLEAVED 
HALF_ROTATED 

◆ StreamKReductionStrategy

Enumerator
Atomic 
Reduction 

◆ TailNumber

enum struct ck_tile::TailNumber
strong
Enumerator
Odd 
Even 
One 
Two 
Three 
Four 
Five 
Six 
Seven 
Empty 
Full 

◆ tile_distribution_pattern

Enumeration describing static tile distribution patterns.

Enumerator
thread_raked 

Thread raked pattern.

warp_raked 

Warp raked pattern.

block_raked 

Block raked pattern (also known as linear).

◆ WGAttrCtlEnum

enum class ck_tile::WGAttrCtlEnum
strong
Enumerator
Default_ 
Raw_vvv 
Raw_vaa 
Raw_vav 
Raw_vva 
Raw_avv 

◆ WGAttrNumAccessEnum

enum class ck_tile::WGAttrNumAccessEnum
strong
Enumerator
Single 
Double 
Quad 
Invalid 

Function Documentation

◆ abs() [1/7]

◆ abs() [2/7]

template<typename T>
CK_TILE_HOST_DEVICE T ck_tile::abs ( const T & x)

◆ abs() [3/7]

CK_TILE_DEVICE double ck_tile::abs ( double x)

◆ abs() [4/7]

CK_TILE_DEVICE float ck_tile::abs ( float x)

◆ abs() [5/7]

CK_TILE_DEVICE fp16_t ck_tile::abs ( fp16_t x)

◆ abs() [6/7]

CK_TILE_DEVICE int32_t ck_tile::abs ( int32_t x)

◆ abs() [7/7]

CK_TILE_DEVICE int8_t ck_tile::abs ( int8_t x)

◆ acos() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::acos ( T x)

◆ acos() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::acos ( T x)

◆ acos< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::acos< double > ( double x)

◆ acos< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::acos< double > ( double x)

◆ acos< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::acos< float > ( float x)

◆ acos< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::acos< float > ( float x)

◆ acosh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::acosh ( T x)

◆ acosh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::acosh ( T x)

◆ acosh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::acosh< double > ( double x)

◆ acosh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::acosh< double > ( double x)

◆ acosh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::acosh< float > ( float x)

◆ acosh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::acosh< float > ( float x)

◆ adaptor_coordinate_is_valid()

template<typename Adaptor, typename AdaptorCoord>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::adaptor_coordinate_is_valid ( const Adaptor & adaptor,
const AdaptorCoord & coord )
constexpr

◆ adaptor_coordinate_is_valid_assuming_top_index_is_valid()

template<typename Adaptor, typename AdaptorCoord>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::adaptor_coordinate_is_valid_assuming_top_index_is_valid ( const Adaptor & adaptor,
const AdaptorCoord & coord )
constexpr

◆ add()

template<typename T, typename ComputeType>
CK_TILE_HOST_DEVICE T ck_tile::add ( const T & a,
const T & b )

◆ add_bf16x2_t()

CK_TILE_HOST_DEVICE bf16x2_t ck_tile::add_bf16x2_t ( const bf16x2_t & a,
const bf16x2_t & b )

◆ add_bf16x4_t()

CK_TILE_HOST_DEVICE bf16x4_t ck_tile::add_bf16x4_t ( const bf16x4_t & a,
const bf16x4_t & b )

◆ add_bf8x4_t()

CK_TILE_HOST_DEVICE bf8x4_t ck_tile::add_bf8x4_t ( const bf8x4_t & a,
const bf8x4_t & b )

◆ add_bf8x8_t()

CK_TILE_HOST_DEVICE bf8x8_t ck_tile::add_bf8x8_t ( const bf8x8_t & a,
const bf8x8_t & b )

◆ add_f16x2_t()

CK_TILE_HOST_DEVICE fp16x2_t ck_tile::add_f16x2_t ( const fp16x2_t & a,
const fp16x2_t & b )

◆ add_fp8x4_t()

CK_TILE_HOST_DEVICE fp8x4_t ck_tile::add_fp8x4_t ( const fp8x4_t & a,
const fp8x4_t & b )

◆ add_fp8x8_t()

CK_TILE_HOST_DEVICE fp8x8_t ck_tile::add_fp8x8_t ( const fp8x8_t & a,
const fp8x8_t & b )

◆ address_space_to_string()

CK_TILE_HOST_DEVICE constexpr const char * ck_tile::address_space_to_string ( address_space_enum addr_space)
constexpr

Helper function to convert address space enum to string.

◆ amd_async_buffer_load()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load ( CK_TILE_LDS_ADDR T * smem,
int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset,
index_t src_immediate_addr_offset = 0,
index_t flag = 0,
bool_constant< oob_conditional_check > = {} )

◆ amd_async_buffer_load_impl()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_impl ( CK_TILE_LDS_ADDR T * smem,
int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset,
index_t src_immediate_addr_offset = 0,
bool_constant< pre_nop > = {} )

◆ amd_async_buffer_load_with_oob()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob ( CK_TILE_LDS_ADDR T * smem,
const int32x4_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool is_valid_element,
bool_constant< oob_conditional_check > = {} )

◆ amd_async_buffer_load_with_oob_raw() [1/2]

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw ( T * smem,
const int32x4_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool_constant< pre_nop > = {} )

◆ amd_async_buffer_load_with_oob_raw() [2/2]

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw ( T * smem,
const T * p_src_wave,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
index_t src_element_space_size,
bool_constant< pre_nop > = {} )

◆ amd_buffer_atomic_add()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add ( const thread_buffer< T, N > & src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_atomic_add_impl()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_impl ( const thread_buffer< T, N > & src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_atomic_add_raw()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_raw ( const thread_buffer< T, N > & src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const index_t dst_linear_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size,
bool_constant< pre_nop > = {} )

◆ amd_buffer_atomic_max()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max ( const thread_buffer< T, N > & src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_atomic_max_impl()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max_impl ( const thread_buffer< T, N > src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_load_impl()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_impl ( int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_impl_with_bytes()

template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< int8_t, N > ck_tile::amd_buffer_load_impl_with_bytes ( int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_invalid_element_return_customized_value()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_invalid_element_return_customized_value ( const T * p_src_wave,
index_t src_thread_element_offset,
bool src_thread_element_valid,
index_t src_element_space_size,
T customized_value )

◆ amd_buffer_load_invalid_element_return_zero()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_invalid_element_return_zero ( const T * p_src_wave,
index_t src_thread_element_offset,
bool src_thread_element_valid,
index_t src_element_space_size )

◆ amd_buffer_load_raw() [1/2]

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw ( thread_buffer< T, N > & dst,
const int32x4_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
index_t is_valid_element = 0,
bool_constant< pre_nop > = {} )

◆ amd_buffer_load_raw() [2/2]

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw ( thread_buffer< T, N > & dst,
const T * p_src_wave,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
index_t src_element_space_size,
index_t is_valid_element = 0,
bool_constant< pre_nop > = {} )

◆ amd_buffer_load_raw_impl()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw_impl ( thread_buffer< T, N > & dst,
int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset,
index_t src_linear_addr_offset,
index_t flag = 0,
bool_constant< pre_nop > = {} )

◆ amd_buffer_store()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store ( const thread_buffer< T, N > & src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_store_impl()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl ( const thread_buffer< T, N > src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_store_impl_with_bytes()

template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl_with_bytes ( const thread_buffer< int8_t, N > src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_store_raw()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw ( const thread_buffer< T, N > & src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const index_t dst_linear_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_store_raw_impl()

template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw_impl ( const thread_buffer< T, N > & dst_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset,
index_t dst_linear_addr_offset,
index_t is_valid_element = 1 )

◆ amd_wave_read_first_lane() [1/5]

template<typename Object, std::enable_if_t< std::is_trivially_copyable_v< Object >, int > = 0>
__device__ auto ck_tile::amd_wave_read_first_lane ( const Object & obj)
inline

◆ amd_wave_read_first_lane() [2/5]

__device__ int32_t ck_tile::amd_wave_read_first_lane ( int32_t value)
inline

◆ amd_wave_read_first_lane() [3/5]

__device__ uint32_t ck_tile::amd_wave_read_first_lane ( uint16_t v)
inline

◆ amd_wave_read_first_lane() [4/5]

__device__ uint32_t ck_tile::amd_wave_read_first_lane ( uint32_t value)
inline

◆ amd_wave_read_first_lane() [5/5]

__device__ uint32_t ck_tile::amd_wave_read_first_lane ( uint8_t v)
inline

◆ apply()

template<typename F, typename Tuple>
decltype(auto) ck_tile::apply ( F && f,
Tuple && t )
constexpr

◆ asin() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::asin ( T x)

◆ asin() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::asin ( T x)

◆ asin< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::asin< double > ( double x)

◆ asin< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::asin< double > ( double x)

◆ asin< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::asin< float > ( float x)

◆ asin< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::asin< float > ( float x)

◆ asinh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::asinh ( T x)

◆ asinh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::asinh ( T x)

◆ asinh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::asinh< double > ( double x)

◆ asinh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::asinh< double > ( double x)

◆ asinh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::asinh< float > ( float x)

◆ asinh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::asinh< float > ( float x)

◆ async_buffer_load_dwordxn_v()

template<unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::async_buffer_load_dwordxn_v ( void * smem,
int32x4_t rsrc,
index_t voffset,
index_t ,
index_t ioffset,
index_t = 0,
bool_constant< pre_nop > = {} )

◆ async_buffer_load_fence()

CK_TILE_DEVICE void ck_tile::async_buffer_load_fence ( index_t cnt = 0)

◆ async_load_fence()

CK_TILE_DEVICE auto ck_tile::async_load_fence ( index_t cnt = 0)

◆ async_load_fence_raw()

CK_TILE_DEVICE auto ck_tile::async_load_fence_raw ( index_t cnt = 0)

◆ async_load_tile()

template<typename LdsTileWindow_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::async_load_tile ( LdsTileWindow_ && lds_tile,
const TileWindow_ & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {} )

◆ async_load_tile_raw()

template<typename LdsTileWindow_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::async_load_tile_raw ( LdsTileWindow_ && lds_tile,
const TileWindow_ & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {},
bool_constant< pre_nop > = {} )

◆ atan() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::atan ( T x)

◆ atan() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::atan ( T x)

◆ atan< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::atan< double > ( double x)

◆ atan< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::atan< double > ( double x)

◆ atan< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::atan< float > ( float x)

◆ atan< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::atan< float > ( float x)

◆ atanh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::atanh ( T x)

◆ atanh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::atanh ( T x)

◆ atanh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::atanh< double > ( double x)

◆ atanh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::atanh< double > ( double x)

◆ atanh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::atanh< float > ( float x)

◆ atanh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::atanh< float > ( float x)

◆ atomic_add()

template<typename X>
CK_TILE_DEVICE void ck_tile::atomic_add ( X * p_dst,
const X & x )

◆ atomic_add< bf16x2_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x2_t > ( bf16x2_t * p_dst,
const bf16x2_t & x )

◆ atomic_add< bf16x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x4_t > ( bf16x4_t * p_dst,
bf16x4_t const & x )

◆ atomic_add< bf8x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x4_t > ( bf8x4_t * p_dst,
const bf8x4_t & x )

◆ atomic_add< bf8x8_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x8_t > ( bf8x8_t * p_dst,
bf8x8_t const & x )

◆ atomic_add< fp16x2_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp16x2_t > ( fp16x2_t * p_dst,
fp16x2_t const & x )

◆ atomic_add< fp8x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x4_t > ( fp8x4_t * p_dst,
const fp8x4_t & x )

◆ atomic_add< fp8x8_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x8_t > ( fp8x8_t * p_dst,
fp8x8_t const & x )

◆ atomic_add_g()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::atomic_add_g ( T * p_dst,
const thread_buffer< T, N > & x )

◆ atomic_max_g()

template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::atomic_max_g ( T * p_dst,
const thread_buffer< T, N > & x )

◆ bf16_to_double()

CK_TILE_HOST_DEVICE constexpr double ck_tile::bf16_to_double ( bfloat16_t x)
constexpr

◆ bf16_to_double_raw()

CK_TILE_HOST_DEVICE constexpr double ck_tile::bf16_to_double_raw ( uint16_t x)
constexpr

◆ bf16_to_float()

CK_TILE_HOST_DEVICE constexpr float ck_tile::bf16_to_float ( bfloat16_t x)
constexpr

◆ bf16_to_float_raw()

CK_TILE_HOST_DEVICE constexpr float ck_tile::bf16_to_float_raw ( uint16_t x)
constexpr

◆ bf16_to_fp16()

CK_TILE_HOST_DEVICE constexpr half_t ck_tile::bf16_to_fp16 ( bfloat16_t x)
constexpr

◆ bf16_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::bf16_to_pk_fp4 ( const bf16_t & x,
float scale )
constexpr

◆ bf16x2_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::bf16x2_to_pk_fp4 ( const bf16x2_t & x,
float scale )
constexpr

◆ bf8_to_float()

CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float ( bf8_t x)

◆ bf8_to_float_raw()

CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float_raw ( uint8_t x)

◆ bit_cast()

template<typename Y, typename X>
CK_TILE_HOST_DEVICE constexpr Y ck_tile::bit_cast ( const X & x)
constexpr

◆ block_sync_lds()

template<index_t lgkmcnt = 0>
CK_TILE_DEVICE void ck_tile::block_sync_lds ( )

◆ block_sync_lds_direct_load()

template<index_t vmcnt = 0>
CK_TILE_DEVICE void ck_tile::block_sync_lds_direct_load ( )

◆ block_sync_load_raw()

CK_TILE_DEVICE void ck_tile::block_sync_load_raw ( index_t cnt = 0)

◆ block_tile_reduce() [1/2]

template<typename AccDistributedTensor_, typename InDistributedTensor_, index_t... InReduceDims, typename ReduceFunc>
CK_TILE_DEVICE void ck_tile::block_tile_reduce ( AccDistributedTensor_ & acc_tensor,
const InDistributedTensor_ & in_tensor,
sequence< InReduceDims... > ,
const ReduceFunc & reduce_func )

◆ block_tile_reduce() [2/2]

template<typename AccDataType_, typename InDistributedTensor_, index_t... InReduceDims, typename ReduceFunc, typename InDataType_>
CK_TILE_DEVICE auto ck_tile::block_tile_reduce ( const InDistributedTensor_ & in_tensor,
sequence< InReduceDims... > in_reduce_dims,
const ReduceFunc & reduce_func,
const InDataType_ & reduce_init )

◆ block_tile_reduce_sync()

template<typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true, bool CrossWarp = true>
CK_TILE_DEVICE void ck_tile::block_tile_reduce_sync ( AccDistributedTensor_ & acc_tensor,
const ReduceFunc & reduce_func,
bool_constant< WithBroadcast > = {},
bool_constant< CrossWarp > = {} )

◆ block_tile_reduce_xor_sync()

template<typename AccDistributedTensor_, typename ReduceFunc>
CK_TILE_DEVICE void ck_tile::block_tile_reduce_xor_sync ( AccDistributedTensor_ & acc_tensor,
const ReduceFunc & reduce_func )

◆ block_tile_welford_calculate_max_count()

template<typename BlockShape>
CK_TILE_DEVICE constexpr index_t ck_tile::block_tile_welford_calculate_max_count ( int row_size)
constexpr

◆ block_tile_welford_post_scale_var()

template<typename VarDistributedTensor_, bool FastFdiv_ = false>
CK_TILE_DEVICE constexpr void ck_tile::block_tile_welford_post_scale_var ( VarDistributedTensor_ & var_tensor,
int count,
bool_constant< FastFdiv_ > = {} )
constexpr

◆ BlockReduce2D()

template<typename T>
CK_TILE_HOST_DEVICE_EXTERN ck_tile::BlockReduce2D ( const T & ,
const typename T::DataType &  )->BlockReduce2D< T >

◆ blockwise_gemm_kernel()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
__global__ void ck_tile::blockwise_gemm_kernel ( ADataType * A,
BDataType * B,
CDataType * C,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC,
ck_tile::index_t scale_granularity_m,
ck_tile::index_t scale_granularity_n,
ck_tile::index_t scale_granularity_k,
float * scale_A_ptr,
float * scale_B_ptr )

◆ buffer_load_fence() [1/2]

CK_TILE_DEVICE void ck_tile::buffer_load_fence ( index_t cnt = 0)

◆ buffer_load_fence() [2/2]

template<typename... T>
CK_TILE_DEVICE void ck_tile::buffer_load_fence ( index_t cnt = 0,
T &... o )

◆ buffer_store_fence()

CK_TILE_DEVICE void ck_tile::buffer_store_fence ( index_t cnt = 0)

◆ c_style_pointer_cast()

template<typename PY, typename PX, typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type = false>
CK_TILE_HOST_DEVICE PY ck_tile::c_style_pointer_cast ( PX p_x)

◆ calculate_reference_flat_indexing()

template<typename ADataType, typename BDataType, typename DDataType, typename EDataType, typename AccDataType, typename CDEElementWise>
void ck_tile::calculate_reference_flat_indexing ( const ck_tile::HostTensor< ADataType > & a_full_dims,
const ck_tile::HostTensor< BDataType > & b_full_dims,
const std::vector< ck_tile::HostTensor< DDataType > > & ds_full_dims_host,
ck_tile::HostTensor< EDataType > & e_full_dims_host_ref,
ck_tile::index_t G_total,
ck_tile::index_t M_total,
ck_tile::index_t N_total,
ck_tile::index_t K_total,
const CDEElementWise & cde_elementwise )

◆ calculate_reference_multi_dimensional()

template<typename ADataType, typename BDataType, typename DDataType, typename EDataType, typename AccDataType, typename CDEElementWise>
void ck_tile::calculate_reference_multi_dimensional ( const HostTensor< ADataType > & a_full_dims,
const HostTensor< BDataType > & b_full_dims,
const std::vector< HostTensor< DDataType > > & ds_full_dims_host,
HostTensor< EDataType > & e_full_dims_host_ref,
const std::vector< index_t > & G_dims,
const std::vector< index_t > & M_dims,
const std::vector< index_t > & N_dims,
const std::vector< index_t > & K_dims,
const std::vector< index_t > & A_dims,
const std::vector< index_t > & B_dims,
const std::vector< index_t > & E_dims,
const CDEElementWise & cde_elementwise )

◆ calculate_spatial_piece()

template<typename TilePartitioner>
CK_TILE_HOST SplitImagePieceInfo ck_tile::calculate_spatial_piece ( ck_tile::index_t piece_idx,
ck_tile::index_t num_d_pieces,
ck_tile::index_t num_h_pieces,
ck_tile::index_t num_w_pieces,
ck_tile::index_t base_piece_d,
ck_tile::index_t base_piece_h,
ck_tile::index_t base_piece_w,
ck_tile::index_t total_d,
ck_tile::index_t total_h,
ck_tile::index_t total_w,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t total_blocks )

Calculate piece information for split-image convolution.

Overview
Computes spatial coordinates, dimensions, and GPU block range for a single piece in split-image convolution. Handles edge pieces that may have different sizes due to non-uniform division.
Template Parameters
TilePartitioner: Type providing MPerBlock and NPerBlock constants
Parameters
piece_idx: Index of the piece to calculate (0-based)
num_d_pieces: Number of pieces in D dimension
num_h_pieces: Number of pieces in H dimension
num_w_pieces: Number of pieces in W dimension
base_piece_d: Base size of each D piece (may differ for last piece)
base_piece_h: Base size of each H piece (may differ for last piece)
base_piece_w: Base size of each W piece (may differ for last piece)
total_d: Total D dimension size (output space)
total_h: Total H dimension size (output space)
total_w: Total W dimension size (output space)
N: Batch size
K: Output channels
total_blocks: Accumulated block count from previous pieces
Returns
SplitImagePieceInfo containing all metadata for this piece

◆ call_f_unpack_args()

template<typename F, typename T>
CK_TILE_HOST auto ck_tile::call_f_unpack_args ( F f,
T args )

◆ call_f_unpack_args_impl()

template<typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto ck_tile::call_f_unpack_args_impl ( F f,
T args,
std::index_sequence< Is... >  )

◆ cast_pointer_to_constant_address_space()

template<typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * ck_tile::cast_pointer_to_constant_address_space ( T * p)

◆ cast_pointer_to_generic_address_space()

template<typename T>
__device__ T * ck_tile::cast_pointer_to_generic_address_space ( T CK_CONSTANT_ADDRESS_SPACE * p)

◆ cast_tile()

template<typename DstType, typename SrcTensor>
CK_TILE_DEVICE auto ck_tile::cast_tile ( const SrcTensor & src_tensor)

◆ ceil() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::ceil ( T x)

◆ ceil() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::ceil ( T x)

◆ ceil< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::ceil< double > ( double x)

◆ ceil< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::ceil< double > ( double x)

◆ ceil< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::ceil< float > ( float x)

◆ ceil< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::ceil< float > ( float x)

◆ ceil< fp16_t >()

◆ chain_tensor_adaptors() [1/2]

template<typename TensorAdaptor0, typename TensorAdaptor1>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::chain_tensor_adaptors ( const TensorAdaptor0 & adaptor0,
const TensorAdaptor1 & adaptor1 )
constexpr

◆ chain_tensor_adaptors() [2/2]

template<typename X, typename... Xs, typename std::enable_if< sizeof...(Xs) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::chain_tensor_adaptors ( const X & x,
const Xs &... xs )
constexpr

◆ check_err() [1/7]

template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf8_t >), bool > CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false )

Check errors between BF8 ranges.

Specialized comparison for 8-bit brain floating point values that considers the specific numerical properties and error characteristics of the BF8 format.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
rtol: Relative tolerance
atol: Absolute tolerance
allow_infinity_ref: Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [2/7]

template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false )

Check errors between half precision floating point ranges.

Compares two ranges of half precision floating point values within specified tolerances. This specialization handles the specific requirements and characteristics of half precision floating point comparisons.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
rtol: Relative tolerance
atol: Absolute tolerance
allow_infinity_ref: Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [3/7]

template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf16_t >, bool >::type CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false )

Check errors between brain floating point (bf16_t) ranges using the specified tolerances.

Compares two ranges of brain floating point values within specified relative and absolute tolerances.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
rtol: Relative tolerance
atol: Absolute tolerance
allow_infinity_ref: Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [4/7]

template<typename Range, typename RefRange>
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_floating_point_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6,
bool allow_infinity_ref = false )

Check errors between floating point ranges using the specified tolerances.

Compares two ranges of floating point values within specified relative and absolute tolerances. This overload handles standard floating point types except half precision floating point.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
rtol: Relative tolerance
atol: Absolute tolerance
allow_infinity_ref: Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [5/7]

template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_integral_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, bf16_t >), bool > CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double = 0,
double atol = 0 )

Check errors between integer ranges.

Compares two ranges of integer values with an absolute tolerance. This specialization handles integer types and optionally int4_t when the experimental bit int extension is enabled.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
atol: Absolute tolerance
Returns
True if check passes, false otherwise

◆ check_err() [6/7]

template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, pk_fp4_t >), bool > CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
double = 0,
double = 0 )

Check errors between pk_fp4_t ranges.

Compares two ranges of pk_fp4_t without tolerance. This specialization handles the ck_tile::pk_fp4_t type.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
Returns
True if check passes, false otherwise

◆ check_err() [7/7]

template<typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, fp8_t >), bool > CK_TILE_HOST ck_tile::check_err ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!",
unsigned max_rounding_point_distance = 1,
double atol = 1e-1,
bool allow_infinity_ref = false )

Check errors between FP8 ranges.

Specialized comparison for 8-bit floating point values that takes into account the unique characteristics and limitations of FP8 arithmetic, including rounding point distances and special handling of infinity values.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if check fails
max_rounding_point_distance: Maximum allowed distance between rounding points
atol: Absolute tolerance
allow_infinity_ref: Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_size_mismatch()

template<typename Range, typename RefRange>
CK_TILE_HOST bool ck_tile::check_size_mismatch ( const Range & out,
const RefRange & ref,
const std::string & msg = "Error: Incorrect results!" )

Check for size mismatch between output and reference ranges.

Verifies that the output and reference ranges are the same size.

Template Parameters
Range: Type of output range
RefRange: Type of reference range
Parameters
out: Output range to check
ref: Reference range to check against
msg: Error message to display if sizes mismatch
Returns
True if sizes mismatch, false otherwise

◆ check_wmma_supported()

template<typename ADataType, typename BDataType, typename AccDataType, index_t M_Warp_Tile, index_t N_Warp_Tile, index_t K_Warp_Tile>
CK_TILE_HOST bool ck_tile::check_wmma_supported ( )

◆ CK_PRINT() [1/2]

template<typename... type>
void ck_tile::CK_PRINT ( )
inline constexpr

◆ CK_PRINT() [2/2]

template<auto... val>
void ck_tile::CK_PRINT ( )
inline constexpr

◆ CK_TILE_ERROR()

template<typename... Args>
void ck_tile::CK_TILE_ERROR ( Args &&... args)
noexcept

◆ clamp()

template<typename T>
CK_TILE_HOST_DEVICE constexpr T ck_tile::clamp ( const T & x,
const T & lowerbound,
const T & upperbound )
constexpr

◆ clear_tile()

template<typename DstrTensors>
CK_TILE_DEVICE void ck_tile::clear_tile ( DstrTensors & dstr_tensor)

◆ clz()

CK_TILE_DEVICE int ck_tile::clz ( uint32_t x)

◆ cmp_lt_to_exec()

template<typename X, typename Y>
CK_TILE_DEVICE auto ck_tile::cmp_lt_to_exec ( const X & x,
const Y & y )

◆ composes()

template<typename... Ts>
__host__ __device__ ck_tile::composes ( Ts && ...) ->composes< remove_cvref_t< Ts >... >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ concat() [1/4]

template<typename... Ts>
auto ck_tile::concat ( const Ts &... xs) ->std::enable_if_t< AllConvertibleToStringView< Ts... >, std::string >
nodiscard

◆ concat() [2/4]

template<typename... Ts>
auto ck_tile::concat ( const Ts &... xs) ->std::enable_if_t<!AllConvertibleToStringView< Ts... >, std::string >
nodiscard

◆ concat() [3/4]

template<typename Sep, typename First, typename... Rest>
auto ck_tile::concat ( Sep sep,
const First & first,
const Rest &... rest )->std::enable_if_t< AllConvertibleToStringView< First, Rest... >, std::string >
nodiscard

◆ concat() [4/4]

template<typename Sep, typename First, typename... Rest>
auto ck_tile::concat ( Sep sep,
const First & first,
const Rest &... rest )->std::enable_if_t<!AllConvertibleToStringView< First, Rest... >, std::string >
nodiscard

◆ concat_tuple() [1/3]

template<typename... X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::concat_tuple ( const tuple< X... > & tx)
constexpr

◆ concat_tuple() [2/3]

template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::concat_tuple ( const tuple< X... > & tx,
const tuple< Y... > & ty )
constexpr

◆ concat_tuple() [3/3]

template<typename... X, typename... Tuples>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::concat_tuple ( const tuple< X... > & tx,
const Tuples &... tuples )
constexpr

◆ concat_tuple_of_reference()

template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::concat_tuple_of_reference ( const tuple< X &... > & tx,
const tuple< Y &... > & ty )
constexpr

◆ concatInto()

template<typename... Ts>
auto ck_tile::concatInto ( std::string & result,
const Ts &... xs )->std::enable_if_t< AllConvertibleToStringView< Ts... >, void >

◆ conditional_expr()

template<bool predicate, typename X, typename Y>
auto ck_tile::conditional_expr ( X && x,
Y && y )
constexpr

◆ constexpr_strlen()

size_t ck_tile::constexpr_strlen ( const char * c)
constexpr

◆ construct_f_unpack_args()

template<typename F, typename T>
CK_TILE_HOST auto ck_tile::construct_f_unpack_args ( F ,
T args )

◆ construct_f_unpack_args_impl()

template<typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto ck_tile::construct_f_unpack_args_impl ( T args,
std::index_sequence< Is... >  )

◆ container_concat() [1/4]

template<typename T, index_t NX, index_t NY>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_concat ( const array< T, NX > & ax,
const array< T, NY > & ay )
constexpr

◆ container_concat() [2/4]

template<typename Container>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_concat ( const Container & x)
constexpr

◆ container_concat() [3/4]

template<typename... X, typename... Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_concat ( const tuple< X... > & tx,
const tuple< Y... > & ty )
constexpr

◆ container_concat() [4/4]

template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_concat ( const X & x,
const Ys &... ys )
constexpr

◆ container_find()

template<index_t... Is>
index_t ck_tile::container_find ( sequence< Is... > seq,
index_t value )
constexpr

◆ container_push_back() [1/2]

template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_push_back ( const array< TData, NSize > & a,
const TData & x )
constexpr

◆ container_push_back() [2/2]

template<typename... Ts, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_push_back ( const tuple< Ts... > & a,
const T & x )
constexpr

◆ container_push_front()

template<typename... Ts, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_push_front ( const tuple< Ts... > & a,
const T & x )
constexpr

◆ container_reduce()

template<typename Container, typename Reduce, typename Init, index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reduce ( const Container & x,
Reduce reduce,
Init init,
number< IBegin > = number<0>{},
number< IEnd > = number<Container::size()>{},
number< IStep > = number<1>{} )
constexpr

◆ container_reduce_impl()

template<typename Container, typename Reduce, typename ROld, index_t I, index_t IEnd, index_t IStep>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reduce_impl ( const Container & x,
Reduce reduce,
ROld r_old,
number< I > i,
number< IEnd > ,
number< IStep >  )
constexpr

◆ container_reorder_given_new2old() [1/4]

template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_new2old ( const array< TData, NSize > & old_array,
const map< index_t, index_t > & new2old )
constexpr

◆ container_reorder_given_new2old() [2/4]

template<typename TData, index_t NSize, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_new2old ( const array< TData, NSize > & old_array,
sequence< IRs... >  )
constexpr

◆ container_reorder_given_new2old() [3/4]

template<typename... Ts, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_new2old ( const tuple< Ts... > & old_tuple,
sequence< IRs... >  )
constexpr

◆ container_reorder_given_new2old() [4/4]

template<index_t... Is, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_new2old ( sequence< Is... > ,
sequence< IRs... >  )
constexpr

◆ container_reorder_given_old2new() [1/4]

template<typename TData, index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_old2new ( const array< TData, NSize > & old_array,
const map< index_t, index_t > & old2new )
constexpr

◆ container_reorder_given_old2new() [2/4]

template<typename TData, index_t NSize, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_old2new ( const array< TData, NSize > & old_array,
sequence< IRs... > old2new )
constexpr

◆ container_reorder_given_old2new() [3/4]

template<typename... Ts, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_old2new ( const tuple< Ts... > & old_tuple,
sequence< IRs... > old2new )
constexpr

◆ container_reorder_given_old2new() [4/4]

template<index_t... Is, index_t... IRs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reorder_given_old2new ( sequence< Is... > old_seq,
sequence< IRs... >  )
constexpr

◆ container_reverse_exclusive_scan() [1/3]

template<typename TData, index_t NSize, typename Reduce, typename Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_exclusive_scan ( const array< TData, NSize > & x,
Reduce f,
Init init )
constexpr

◆ container_reverse_exclusive_scan() [2/3]

template<index_t... Is, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_exclusive_scan ( const sequence< Is... > & seq,
Reduce f,
number< Init >  )
constexpr

◆ container_reverse_exclusive_scan() [3/3]

template<typename... Xs, typename Reduce, typename Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_exclusive_scan ( const tuple< Xs... > & x,
Reduce reduce,
Init init )
constexpr

◆ container_reverse_exclusive_scan_impl()

template<typename... Xs, typename Reduce, index_t I, typename YOld, typename ROld>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_exclusive_scan_impl ( const tuple< Xs... > & x,
Reduce reduce,
number< I > i,
YOld y_old,
ROld r_old )
constexpr

◆ container_reverse_inclusive_scan() [1/2]

template<typename TData, index_t NSize, typename Reduce>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_inclusive_scan ( const array< TData, NSize > & x,
Reduce f,
TData init )
constexpr

◆ container_reverse_inclusive_scan() [2/2]

template<typename... Xs, typename Reduce, typename TData>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::container_reverse_inclusive_scan ( const tuple< Xs... > & x,
Reduce f,
TData init )
constexpr

◆ convert_to_float()

template<typename T>
CK_TILE_HOST_DEVICE float ck_tile::convert_to_float ( typename T::raw_type data,
float scale = 1.f )

◆ convert_to_type()

template<typename T>
CK_TILE_HOST_DEVICE T::raw_type ck_tile::convert_to_type ( float value,
float scale = 1.f )

◆ coordinate_has_valid_offset()

template<typename TensorDesc, typename TensorCoord>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::coordinate_has_valid_offset ( const TensorDesc & tensor_desc,
const TensorCoord & coord )
constexpr

◆ coordinate_has_valid_offset_assuming_top_index_is_valid()

template<typename TensorDesc, typename TensorCoord>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::coordinate_has_valid_offset_assuming_top_index_is_valid ( const TensorDesc & tensor_desc,
const TensorCoord & coord )
constexpr

◆ cos()

template<typename T>
CK_TILE_HOST T ck_tile::cos ( T x)

◆ cos< double >()

template<>
CK_TILE_HOST double ck_tile::cos< double > ( double x)

◆ cos< float >()

template<>
CK_TILE_HOST float ck_tile::cos< float > ( float x)

◆ cosh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::cosh ( T x)

◆ cosh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::cosh ( T x)

◆ cosh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::cosh< double > ( double x)

◆ cosh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::cosh< double > ( double x)

◆ cosh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::cosh< float > ( float x)

◆ cosh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::cosh< float > ( float x)

◆ double_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t ck_tile::double_to_bf16 ( double f,
constant< rounding > = {} )
constexpr

◆ double_to_bf16_raw()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr uint16_t ck_tile::double_to_bf16_raw ( double f,
constant< rounding > = {} )
constexpr

◆ double_to_fp16()

CK_TILE_HOST_DEVICE constexpr half_t ck_tile::double_to_fp16 ( const double & x)
constexpr

◆ double_to_fp16_hip()

CK_TILE_HOST_DEVICE constexpr fp16_hip_t ck_tile::double_to_fp16_hip ( const double & x)
constexpr

◆ DS_READ_TR_SIZE()

int ck_tile::DS_READ_TR_SIZE ( )
constexpr

◆ embed_tuples()

template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::embed_tuples ( F f,
const X & x )
constexpr

◆ EnvGetString()

template<class EnvVar>
const std::string & ck_tile::EnvGetString ( EnvVar )
inline

◆ EnvIsDisabled()

template<class EnvVar>
bool ck_tile::EnvIsDisabled ( EnvVar )
inline

◆ EnvIsEnabled()

template<class EnvVar>
bool ck_tile::EnvIsEnabled ( EnvVar )
inline

◆ EnvIsUnset()

template<class EnvVar>
bool ck_tile::EnvIsUnset ( EnvVar )
inline

◆ EnvUnset()

template<class EnvVar>
void ck_tile::EnvUnset ( EnvVar )

◆ EnvValue()

template<class EnvVar>
uint64_t ck_tile::EnvValue ( EnvVar )
inline

◆ equal()

__host__ __device__ ck_tile::equal ( ) ->equal< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ estimate_num_wgs_per_tile()

template<ck_tile::StreamKReductionStrategy ReductionStrategy>
ck_tile::index_t ck_tile::estimate_num_wgs_per_tile ( index_t sk_ctas,
index_t iters_per_sk_cta,
index_t iters_per_tile )

Estimates the number of Stream-K workgroups per macro tile in the C tensor.

Parameters
sk_ctas: Number of Stream-K workgroups.
iters_per_sk_cta: Number of iterations per Stream-K workgroup.
iters_per_tile: Number of iterations per tile (i.e., the number of macro tiles in the K dimension).
Returns
ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor.
Note
It is assumed that iters_per_sk_cta > 0.

◆ exclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
auto ck_tile::exclusive_scan_sequence ( Seq ,
Reduce ,
number< Init >  )
constexpr

◆ exp() [1/3]

◆ exp() [2/3]

template<typename T>
CK_TILE_DEVICE T ck_tile::exp ( T x)

◆ exp() [3/3]

template<typename T>
CK_TILE_HOST T ck_tile::exp ( T x)

◆ exp2() [1/2]

CK_TILE_DEVICE bfloat16_t ck_tile::exp2 ( bfloat16_t x)

◆ exp2() [2/2]

CK_TILE_HOST float ck_tile::exp2 ( float x)

◆ exp< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::exp< double > ( double x)

◆ exp< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::exp< double > ( double x)

◆ exp< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::exp< float > ( float x)

◆ exp< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::exp< float > ( float x)

◆ exp< fp16_t >()

◆ expm1() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::expm1 ( T x)

◆ expm1() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::expm1 ( T x)

◆ expm1< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::expm1< double > ( double x)

◆ expm1< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::expm1< double > ( double x)

◆ expm1< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::expm1< float > ( float x)

◆ expm1< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::expm1< float > ( float x)

◆ flag_to_exec()

template<typename T>
CK_TILE_DEVICE auto ck_tile::flag_to_exec ( const T & v_flag)

◆ float_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t ck_tile::float_to_bf16 ( float f,
constant< rounding > = {} )
constexpr

◆ float_to_bf16_raw()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr uint16_t ck_tile::float_to_bf16_raw ( float f,
constant< rounding > = {} )
constexpr

◆ float_to_bf16_rta_asm()

CK_TILE_DEVICE uint16_t ck_tile::float_to_bf16_rta_asm ( float f)

◆ float_to_bf16_rtn_asm()

CK_TILE_DEVICE uint16_t ck_tile::float_to_bf16_rtn_asm ( float f)
constexpr

◆ float_to_bf16_rtn_raw()

CK_TILE_HOST_DEVICE constexpr uint16_t ck_tile::float_to_bf16_rtn_raw ( float f)
constexpr

◆ float_to_bf16_truc_nan_raw()

CK_TILE_HOST_DEVICE constexpr uint16_t ck_tile::float_to_bf16_truc_nan_raw ( float f)
constexpr

◆ float_to_bf16_truc_raw()

CK_TILE_HOST_DEVICE constexpr uint16_t ck_tile::float_to_bf16_truc_raw ( float f)
constexpr

◆ float_to_bf8()

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE bf8_t ck_tile::float_to_bf8 ( float x,
constant< rounding > = {} )

◆ float_to_bf8_raw() [1/2]

template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE bf8_raw_t ck_tile::float_to_bf8_raw ( float x,
constant< rounding >  )

◆ float_to_bf8_raw() [2/2]

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_bf8_raw ( float x,
constant< rounding > = {} )

◆ float_to_fp16()

CK_TILE_HOST_DEVICE constexpr half_t ck_tile::float_to_fp16 ( const float & x)
constexpr

◆ float_to_fp16_hip()

CK_TILE_HOST_DEVICE constexpr fp16_hip_t ck_tile::float_to_fp16_hip ( const float & x)
constexpr

◆ float_to_fp8()

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE fp8_t ck_tile::float_to_fp8 ( float x,
constant< rounding > = {} )

◆ float_to_fp8_raw() [1/2]

template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE fp8_raw_t ck_tile::float_to_fp8_raw ( float x,
constant< rounding >  )

◆ float_to_fp8_raw() [2/2]

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_fp8_raw ( float x,
constant< rounding > = {} )

◆ float_to_fp8_rtn_raw()

template<typename SrcT, typename DstT>
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type ck_tile::float_to_fp8_rtn_raw ( SrcT x)

Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even.

This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping.

Template Parameters
DstT: The destination type (fp8_t or bf8_t).
SrcT: The source type (float or half_t) to be converted.
Parameters
x: The floating-point value to be converted.
Returns
The 8-bit floating-point representation of the input value.

◆ float_to_fp8_sr_raw()

template<typename SrcT, typename DstT>
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type ck_tile::float_to_fp8_sr_raw ( SrcT x)

Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding.

This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping and uses a pseudo-random number generator for the stochastic rounding.

Template Parameters
DstT: The destination type (fp8_t or bf8_t).
SrcT: The source type (float or half_t) to be converted.
Parameters
x: The floating-point value to be converted.
Returns
The 8-bit floating-point representation of the input value.

◆ float_to_int8()

CK_TILE_HOST_DEVICE constexpr int8_t ck_tile::float_to_int8 ( const float & x)
constexpr

◆ float_to_mxfp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t ck_tile::float_to_mxfp4 ( float x,
float scale )
constexpr

◆ float_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::float_to_pk_fp4 ( const float & x,
float scale = 1.f )
constexpr

◆ floor() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::floor ( T x)

◆ floor() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::floor ( T x)

◆ floor< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::floor< double > ( double x)

◆ floor< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::floor< double > ( double x)

◆ floor< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::floor< float > ( float x)

◆ floor< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::floor< float > ( float x)

◆ floor< fp16_t >()

◆ flush_icache()

void ck_tile::flush_icache ( )
inline

◆ fnv1a_hash()

unsigned int ck_tile::fnv1a_hash ( std::string_view str,
unsigned int h = 2166136261u )
constexpr

◆ fp16_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE bfloat16_t constexpr ck_tile::fp16_to_bf16 ( half_t f,
constant< rounding > = {} )
constexpr

◆ fp16_to_double()

CK_TILE_HOST_DEVICE constexpr float ck_tile::fp16_to_double ( const half_t & x)
constexpr

◆ fp16_to_double_hip()

CK_TILE_HOST_DEVICE constexpr double ck_tile::fp16_to_double_hip ( const fp16_hip_t & x)
constexpr

◆ fp16_to_float()

CK_TILE_HOST_DEVICE constexpr float ck_tile::fp16_to_float ( const half_t & x)
constexpr

◆ fp16_to_float_hip()

CK_TILE_HOST_DEVICE constexpr float ck_tile::fp16_to_float_hip ( const fp16_hip_t & x)
constexpr

◆ fp16_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::fp16_to_pk_fp4 ( const fp16_t & x,
float scale )
constexpr

◆ fp16x2_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::fp16x2_to_pk_fp4 ( const fp16x2_t & x,
float scale )
constexpr

◆ fp32x2_to_pk_fp4()

CK_TILE_HOST_DEVICE constexpr pk_fp4_t ck_tile::fp32x2_to_pk_fp4 ( const fp32x2_t & x,
float scale )
constexpr

◆ fp8_to_float()

CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float ( fp8_t x)

◆ fp8_to_float_raw()

CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float_raw ( uint8_t x)

◆ gcd() [1/3]

CK_TILE_HOST_DEVICE constexpr index_t ck_tile::gcd ( index_t x,
index_t y )
constexpr

◆ gcd() [2/3]

template<index_t X, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::gcd ( number< X > ,
number< Y >  )
constexpr

◆ gcd() [3/3]

template<typename X, typename... Ys, typename std::enable_if< sizeof...(Ys) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::gcd ( X x,
Ys... ys )
constexpr

◆ gemm_prec_str()

template<typename ADataType_, typename BDataType_>
std::string ck_tile::gemm_prec_str ( )

◆ generate_array()

template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_array ( F && f,
number< N >  )
constexpr

◆ generate_sequence()

template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_sequence ( F ,
number< N >  )
constexpr

◆ generate_sequence_v2()

template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_sequence_v2 ( F && f,
number< N >  )
constexpr

◆ generate_tie()

template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_tie ( F && f,
number< N >  )
constexpr

◆ generate_tuple()

template<typename F, index_t N>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_tuple ( F && f,
number< N >  )
constexpr

◆ generate_tuple_for()

template<typename F, index_t... ids>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::generate_tuple_for ( F && f,
sequence< ids... >  )
constexpr

◆ get_absolute_threshold()

template<typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double ck_tile::get_absolute_threshold ( const double max_possible_num,
const int number_of_accumulations = 1 )

Calculate absolute error threshold for numerical comparisons.

Calculates the absolute error threshold based on the maximum possible value and the characteristics of the data types involved in the computation.

Template Parameters
ComputeDataType: Type used for computation
OutDataType: Type used for output
AccDataType: Type used for accumulation (defaults to ComputeDataType)
Parameters
max_possible_num: Maximum possible value in the computation
number_of_accumulations: Number of accumulation operations performed
Returns
Absolute error threshold based on data type characteristics and maximum value

◆ get_alibi_slopes()

template<typename DataType>
CK_TILE_HOST std::vector< DataType > ck_tile::get_alibi_slopes ( ck_tile::index_t nheads)

◆ get_async_store_smem_info()

template<typename LdsTileWindow_>
CK_TILE_DEVICE auto ck_tile::get_async_store_smem_info ( LdsTileWindow_ && lds_tile)

◆ get_block_1d_id()

CK_TILE_DEVICE index_t ck_tile::get_block_1d_id ( )

◆ get_block_id()

CK_TILE_DEVICE index_t ck_tile::get_block_id ( )

◆ get_block_size()

CK_TILE_DEVICE index_t ck_tile::get_block_size ( )

◆ get_container_subset() [1/2]

template<typename T, index_t N, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::get_container_subset ( const array< T, N > & arr,
sequence< Is... >  )
constexpr

◆ get_container_subset() [2/2]

template<typename... Ts, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::get_container_subset ( const tuple< Ts... > & tup,
sequence< Is... >  )
constexpr

◆ get_default_stride()

template<bool is_row_major>
auto ck_tile::get_default_stride ( std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant< is_row_major >  )

◆ get_device_name()

std::string ck_tile::get_device_name ( )
inline

◆ get_grid_size()

CK_TILE_DEVICE index_t ck_tile::get_grid_size ( )

◆ get_lane_id()

CK_TILE_DEVICE index_t ck_tile::get_lane_id ( )

◆ get_relative_threshold()

template<typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double ck_tile::get_relative_threshold ( const int number_of_accumulations = 1)

Calculate relative error threshold for numerical comparisons.

Calculates the relative error threshold based on the mantissa bits and characteristics of the data types involved in the computation.

Template Parameters
ComputeDataType: Type used for computation
OutDataType: Type used for output
AccDataType: Type used for accumulation (defaults to ComputeDataType)
Parameters
number_of_accumulations: Number of accumulation operations performed
Returns
Relative error threshold based on data type characteristics

◆ get_slice_tile() [1/2]

template<typename DataType_, typename StaticTileDistribution_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto ck_tile::get_slice_tile ( const static_distributed_tensor< DataType_, StaticTileDistribution_ > & tile,
sequence< SliceBegins... > slice_begins,
sequence< SliceEnds... > slice_ends )
constexpr

◆ get_slice_tile() [2/2]

template<typename BottomTensorView_, typename WindowLengths_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto ck_tile::get_slice_tile ( const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & tile,
sequence< SliceBegins... > slice_begins,
sequence< SliceEnds... > slice_ends )
constexpr

◆ get_smem_capacity()

CK_TILE_HOST_DEVICE constexpr index_t ck_tile::get_smem_capacity ( )
constexpr

◆ get_thread_global_1d_id()

CK_TILE_DEVICE index_t ck_tile::get_thread_global_1d_id ( )

◆ get_thread_id()

CK_TILE_DEVICE index_t ck_tile::get_thread_id ( )

◆ get_thread_local_1d_id()

CK_TILE_DEVICE index_t ck_tile::get_thread_local_1d_id ( )

◆ get_warp_id()

template<bool ReturnSgpr = true>
CK_TILE_DEVICE index_t ck_tile::get_warp_id ( bool_constant< ReturnSgpr > = {})

◆ get_warp_size()

CK_TILE_HOST_DEVICE constexpr index_t ck_tile::get_warp_size ( )
constexpr

◆ get_x_indices_from_distributed_indices()

template<typename StaticTileDistribution, typename DistributedIndices>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::get_x_indices_from_distributed_indices ( StaticTileDistribution tile_distribution,
DistributedIndices distributed_indices )
constexpr

◆ get_y_unpacks_from_x_unpacks()

template<typename YLengths, index_t XUnpacks>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::get_y_unpacks_from_x_unpacks ( YLengths ,
number< XUnpacks >  )
constexpr

◆ getConvSpecializationString()

CK_TILE_HOST std::string ck_tile::getConvSpecializationString ( const ConvolutionSpecialization & s)

◆ getSize() [1/6]

template<std::size_t N>
std::size_t ck_tile::getSize ( char(&) [N])
inline nodiscard constexpr noexcept

◆ getSize() [2/6]

std::size_t ck_tile::getSize ( const char & )
inline nodiscard constexpr noexcept

◆ getSize() [3/6]

std::size_t ck_tile::getSize ( const char * s)
inline nodiscard constexpr noexcept

◆ getSize() [4/6]

template<std::size_t N>
std::size_t ck_tile::getSize ( const char(&)[N])
inline nodiscard constexpr noexcept

◆ getSize() [5/6]

std::size_t ck_tile::getSize ( const std::string & s)
inline nodiscard noexcept

◆ getSize() [6/6]

std::size_t ck_tile::getSize ( const std::string_view & s)
inline nodiscard constexpr noexcept

◆ hip_check_error()

CK_TILE_HOST void ck_tile::hip_check_error ( hipError_t x)

◆ histogram_sorted_sequence()

template<typename SeqSortedSamples, index_t r, index_t... rs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::histogram_sorted_sequence ( SeqSortedSamples ,
sequence< r, rs... >  )
constexpr

◆ host_tensor_descriptor()

template<bool is_row_major>
auto ck_tile::host_tensor_descriptor ( std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant< is_row_major >  )

Creates a host tensor descriptor with specified dimensions and layout.

Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor layout is row-major or column-major. This is determined via the compile-time template parameter is_row_major.

Template Parameters
is_row_major: Compile-time flag indicating if the layout is row-major (true) or column-major (false)
Parameters
row: Number of rows in the tensor
col: Number of columns in the tensor
stride: Stride between adjacent rows (for row-major) or columns (for column-major)
Returns
HostTensorDescriptor with shape {row, col} and strides:
  • For row-major: {stride, 1}
  • For column-major: {1, stride}

◆ inclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::inclusive_scan_sequence ( Seq ,
Reduce ,
number< Init >  )
constexpr

◆ InputTileDistributionEncoding()

template<typename InnerEncode, index_t kLeadIterPerWarp, index_t kSecondIterPerWarp, index_t kLeadNumWarps, index_t kSecondNumWarps>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::InputTileDistributionEncoding ( )
constexpr

◆ int8_to_float()

CK_TILE_HOST_DEVICE constexpr float ck_tile::int8_to_float ( const int8_t & x)
constexpr

◆ integer_divide_ceil()

template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::integer_divide_ceil ( X x,
Y y )
constexpr

◆ integer_divide_floor()

template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::integer_divide_floor ( X x,
Y y )
constexpr

◆ integer_least_multiple()

template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::integer_least_multiple ( X x,
Y y )
constexpr

◆ integer_log2_floor()

CK_TILE_HOST_DEVICE constexpr int32_t ck_tile::integer_log2_floor ( int32_t x)
constexpr

◆ is_gfx11_supported()

bool ck_tile::is_gfx11_supported ( )
inline

◆ is_gfx12_supported()

bool ck_tile::is_gfx12_supported ( )
inline

◆ is_load_tr_supported()

bool ck_tile::is_load_tr_supported ( )
inline

◆ is_nested_tuple()

template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::is_nested_tuple ( const tuple< Ts... > & )
constexpr

◆ is_null_tile_window()

template<typename T>
CK_TILE_DEVICE constexpr auto ck_tile::is_null_tile_window ( const T & )
constexpr

◆ is_power_of_two_integer()

CK_TILE_HOST_DEVICE constexpr bool ck_tile::is_power_of_two_integer ( int32_t x)
constexpr

◆ is_wave32()

CK_TILE_HOST bool ck_tile::is_wave32 ( )

◆ isnan() [1/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const bf8_t & x)

◆ isnan() [2/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const bfloat16_t & x)

◆ isnan() [3/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const fp8_t & x)

◆ isnan() [4/8]

CK_TILE_DEVICE bool ck_tile::isnan ( double x)

◆ isnan() [5/8]

CK_TILE_DEVICE bool ck_tile::isnan ( float x)

◆ isnan() [6/8]

CK_TILE_DEVICE bool ck_tile::isnan ( fp16_t x)

◆ isnan() [7/8]

CK_TILE_DEVICE bool ck_tile::isnan ( int32_t x)

◆ isnan() [8/8]

CK_TILE_DEVICE bool ck_tile::isnan ( int8_t x)

◆ kentry() [1/2]

template<typename Arch, int MinBlockPerCu, typename Kernel, typename... Args>
__global__ void ck_tile::kentry ( Args... args)

◆ kentry() [2/2]

template<int MinBlockPerCu, typename Kernel, typename... Args>
__global__ void ck_tile::kentry ( Args... args)

◆ launch_and_check()

template<typename... Callables>
CK_TILE_HOST void ck_tile::launch_and_check ( const stream_config & sc,
Callables &&... callables )

◆ launch_kernel()

template<typename... Callables>
CK_TILE_HOST float ck_tile::launch_kernel ( const stream_config & s,
Callables &&... callables )

◆ launch_kernel_time_mask()

template<typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float ck_tile::launch_kernel_time_mask ( const stream_config & s,
PreprocessFunc preprocess,
Callables &&... callables )

◆ lcm() [1/2]

template<typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::lcm ( X x,
Y y )
constexpr

◆ lcm() [2/2]

template<typename X, typename... Ys, typename std::enable_if< sizeof...(Ys) >= 2, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::lcm ( X x,
Ys... ys )
constexpr

◆ lds_load_fence()

CK_TILE_DEVICE void ck_tile::lds_load_fence ( index_t cnt = 0)

◆ less()

__host__ __device__ ck_tile::less ( ) ->less< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ less_equal()

__host__ __device__ ck_tile::less_equal ( ) ->less_equal< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ llvm_amdgcn_raw_buffer_atomic_add_bf16x2()

CK_TILE_DEVICE_EXTERN bf16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_bf16x2 ( bf16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()

CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 ( fp16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()

CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp32 ( float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_add_i32()

CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_i32 ( int32_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()

CK_TILE_DEVICE_EXTERN double ck_tile::llvm_amdgcn_raw_buffer_atomic_max_fp64 ( double vdata,
int32x4_t rsrc,
int voffset,
int soffset,
int glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16()

CK_TILE_DEVICE_EXTERN _Float16 ck_tile::llvm_amdgcn_raw_buffer_load_fp16 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16x2()

CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16x4()

CK_TILE_DEVICE_EXTERN fp16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32()

CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_load_fp32 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32x2()

CK_TILE_DEVICE_EXTERN fp32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32x4()

CK_TILE_DEVICE_EXTERN fp32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16()

CK_TILE_DEVICE_EXTERN int16_t ck_tile::llvm_amdgcn_raw_buffer_load_i16 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16x2()

CK_TILE_DEVICE_EXTERN int16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16x4()

CK_TILE_DEVICE_EXTERN int16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32()

CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_load_i32 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32x2()

CK_TILE_DEVICE_EXTERN int32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32x4()

CK_TILE_DEVICE_EXTERN int32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8()

CK_TILE_DEVICE_EXTERN int8_t ck_tile::llvm_amdgcn_raw_buffer_load_i8 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8x2()

CK_TILE_DEVICE_EXTERN int8x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8x4()

CK_TILE_DEVICE_EXTERN int8x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_lds()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_load_lds ( int32x4_t rsrc,
as3_uint32_ptr lds_ptr,
index_t size,
index_t voffset,
index_t soffset,
index_t offset,
index_t aux )

◆ llvm_amdgcn_raw_buffer_store_fp16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16 ( _Float16 vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x2 ( fp16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x4 ( fp16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32 ( float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x2 ( fp32x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x4 ( fp32x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16 ( int16_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x2 ( int16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x4 ( int16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32 ( int32_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x2 ( int32x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x4 ( int32x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8 ( int8_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x2 ( int8x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x4 ( int8x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_ui16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16 ( uint16_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_ui16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x2 ( uint16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_ui16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x4 ( uint16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ load_int4_tile()

template<typename BDataType, typename ComputeDataType, index_t UnaryOpSize, typename WarpTile, typename WarpWindow>
CK_TILE_DEVICE void ck_tile::load_int4_tile ( WarpTile & dst,
const WarpWindow & src )

◆ load_tile() [1/3]

template<typename WindowLengths>
CK_TILE_DEVICE auto ck_tile::load_tile ( const null_tile_window< WindowLengths > & )

◆ load_tile() [2/3]

template<typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::load_tile ( const TileWindow_ & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {} )

◆ load_tile() [3/3]

template<typename DistributedTensor_, typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::load_tile ( DistributedTensor_ & dst_tile,
const TileWindow_ & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {} )

◆ load_tile_raw() [1/3]

template<typename T, typename WindowLengths>
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T & ,
const null_tile_window< WindowLengths > &  )

◆ load_tile_raw() [2/3]

template<typename T, typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T & tile,
const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {},
bool_constant< pre_nop > = {} )

◆ load_tile_raw() [3/3]

template<typename T, typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T & tile,
const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window,
number< i_access > = {},
bool_constant< oob_conditional_check > = {},
bool_constant< pre_nop > = {} )

Loads a tile of data using inline assembly.

Note
Bear in mind that when loading data this way, you have to manually initialize your thread buffer and synchronize the load afterwards to make sure it has completed before using the loaded data from registers.
See also
tile_window_with_static_distribution::init_raw() and buffer_view.hpp
buffer_load_fence()

◆ load_tile_transpose()

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>, typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_, typename BottomTensorView_::DataType, Policy>::distr_encoding_valid, Policy>>
CK_TILE_DEVICE auto ck_tile::load_tile_transpose ( const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window)

Transpose-loads a tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. Uses SFINAE to ensure the tile distribution encoding is valid.

This function is intended for use with statically distributed tensor tiles, where the input and output tile distributions differ due to the transpose operation. It ensures that the element space size and vector length remain consistent between the input and output distributions.

Template Parameters
BottomTensorView_: The type of the bottom tensor view.
WindowLengths_: The type representing the window lengths.
TileDistribution_: The type representing the tile distribution.
NumCoord: The number of coordinates (dimensions).
Policy: The transpose policy to use (defaults to DefaultTranspose). The last (unnamed) template parameter is a SFINAE check that ensures the tile distribution encoding is valid.
Parameters
tile_window: The tile window with static distribution to load and transpose.
Returns
A statically distributed tensor containing the transposed tile data.
Note
  • The function uses compile-time checks to ensure the input and output tile distributions are compatible in terms of element space size and vector length.
  • The transpose operation is performed according to the specified Policy.

◆ load_tile_with_elementwise()

template<typename TileWindow_, typename ElementWise_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::load_tile_with_elementwise ( const TileWindow_ & tile_window,
ElementWise_ elementwise,
number< i_access > = {},
bool_constant< oob_conditional_check > = {} )

Load tile with elementwise function.

Note
This function is a modification of the existing load function. It has been extended with two additional parameters: it takes a tuple as input and an elementwise function. For each A = A0, A1… AN, the elementwise function is additionally applied during a single read.

◆ log() [1/3]

◆ log() [2/3]

template<typename T>
CK_TILE_DEVICE T ck_tile::log ( T x)

◆ log() [3/3]

template<typename T>
CK_TILE_HOST T ck_tile::log ( T x)

◆ log< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::log< double > ( double x)

◆ log< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::log< double > ( double x)

◆ log< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::log< float > ( float x)

◆ log< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::log< float > ( float x)

◆ log< fp16_t >()

◆ LogRange()

template<typename Range>
CK_TILE_HOST std::ostream & ck_tile::LogRange ( std::ostream & os,
Range && range,
std::string delim,
int precision = std::cout.precision(),
int width = 0 )

◆ LogRangeAsType()

template<typename T, typename Range>
CK_TILE_HOST std::ostream & ck_tile::LogRangeAsType ( std::ostream & os,
Range && range,
std::string delim,
int precision = std::cout.precision(),
int width = 0 )

◆ m0_inc_with_memory()

CK_TILE_DEVICE void ck_tile::m0_inc_with_memory ( index_t v)

◆ m0_set_with_memory()

CK_TILE_DEVICE void ck_tile::m0_set_with_memory ( index_t v)

◆ make_alibi_from_lr_mask()

template<typename DataType, bool RowMajor = true, unsigned LogMaxSadOprndSize = 16>
CK_TILE_HOST_DEVICE auto ck_tile::make_alibi_from_lr_mask ( DataType slope,
index_t window_left_size,
index_t window_right_size,
index_t y_total,
index_t x_total,
GenericAttentionMaskEnum mask_enum )

◆ make_array()

template<typename D = void, typename... Ts>
CK_TILE_HOST_DEVICE constexpr details::return_type< D, Ts... > ck_tile::make_array ( Ts &&... ts)
constexpr

◆ make_array_with()

template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_array_with ( std::initializer_list< T > ilist)
constexpr

◆ make_buffer_view() [1/2]

template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T, typename BufferSizeType>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_buffer_view ( T *__restrict__ p,
BufferSizeType buffer_size )
constexpr

◆ make_buffer_view() [2/2]

template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T, typename BufferSizeType, typename X, typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_buffer_view ( T *__restrict__ p,
BufferSizeType buffer_size,
X invalid_element_value )
constexpr

◆ make_cluster_descriptor()

template<typename Lengths, typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_cluster_descriptor ( const Lengths & lengths,
ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type{} )
constexpr

◆ make_embed_transform()

template<typename UpLengths, typename Coefficients, typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_embed_transform ( const UpLengths & up_lengths,
const Coefficients & coefficients )
constexpr

◆ make_freeze_transform()

template<typename LowerIndex>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_freeze_transform ( const LowerIndex & low_idx)
constexpr

◆ make_generic_attention_mask_coordinates_from_lr_window()

CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_generic_attention_mask_coordinates_from_lr_window ( index_t left_size,
index_t right_size,
index_t y_total,
index_t x_total,
bool is_top_left = true )
constexpr

◆ make_generic_attention_mask_from_lr_window()

template<typename MaskType>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_generic_attention_mask_from_lr_window ( index_t left_size,
index_t right_size,
index_t y_total,
index_t x_total,
bool is_top_left = true )
constexpr

◆ make_indexing_transform()

template<typename UpLength, typename Indices>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_indexing_transform ( const UpLength & up_lengths,
const Indices & indices )
constexpr

◆ make_indexing_transform_with_adaptor()

template<typename UpLength, typename IndexingAdaptor>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_indexing_transform_with_adaptor ( const UpLength & up_lengths,
const IndexingAdaptor & iadaptor )
constexpr

◆ make_insert_transform()

template<typename UpperIndex>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_insert_transform ( const UpperIndex & up_idx)
constexpr

◆ make_kernel()

template<int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename Arch = void, typename KernelImpl, typename... Args>
CK_TILE_HOST auto ck_tile::make_kernel ( KernelImpl ,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
Args... args )

◆ make_left_pad_transform()

template<typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_left_pad_transform ( const LowLength & low_length,
const LeftPadLength & left_pad_,
bool_constant< SkipIsValidCheck > = bool_constant<false>{} )
constexpr

◆ make_merge_transform()

template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_merge_transform ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v2_magic_division()

template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_merge_transform_v2_magic_division ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v3_division_mod()

template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_merge_transform_v3_division_mod ( const LowLengths & low_lengths)
constexpr

◆ make_modulo_transform()

template<typename Modulus, typename UpLength>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_modulo_transform ( const Modulus & modulus,
const UpLength & up_length )
constexpr

◆ make_multi_index()

template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_multi_index ( Xs &&... xs)
constexpr

◆ make_naive_tensor_descriptor()

template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_descriptor ( const tuple< Lengths... > & lengths,
const tuple< Strides... > & strides,
number< GuaranteedLastDimensionVectorLength > = number<-1>{},
number< GuaranteedLastDimensionVectorStride > = number<-1>{} )
constexpr

◆ make_naive_tensor_descriptor_aligned()

template<typename... Lengths, typename Align>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_descriptor_aligned ( const tuple< Lengths... > & lengths,
Align align )
constexpr

◆ make_naive_tensor_descriptor_packed()

template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_descriptor_packed ( const tuple< Lengths... > & lengths,
number< GuaranteedLastDimensionVectorLength > = number<-1>{} )
constexpr

◆ make_naive_tensor_descriptor_packed_with_offset()

template<typename... Lengths, typename... Strides, typename Offset, index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_descriptor_packed_with_offset ( const tuple< Lengths... > & lengths,
const Offset & offset,
number< GuaranteedLastDimensionVectorLength > = number<-1>{} )
constexpr

◆ make_naive_tensor_descriptor_with_offset()

template<typename... Lengths, typename... Strides, typename offset, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_descriptor_with_offset ( const tuple< Lengths... > & lengths,
const tuple< Strides... > & strides,
const offset & os,
number< GuaranteedLastDimensionVectorLength > = number<-1>{},
number< GuaranteedLastDimensionVectorStride > = number<-1>{} )
constexpr

◆ make_naive_tensor_view()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_view ( DataType *__restrict__ p,
const tuple< Lengths... > & lengths,
const tuple< Strides... > & strides,
number< GuaranteedLastDimensionVectorLength > = number<-1>{},
number< GuaranteedLastDimensionVectorStride > = number<-1>{} )
constexpr

◆ make_naive_tensor_view_packed()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_naive_tensor_view_packed ( DataType *__restrict__ p,
const tuple< Lengths... > & lengths,
number< GuaranteedLastDimensionVectorLength > = number<-1>{} )
constexpr

◆ make_null_tile_window()

template<typename WindowLengths>
CK_TILE_DEVICE constexpr auto ck_tile::make_null_tile_window ( const WindowLengths & window_lengths)
constexpr

◆ make_offset_transform()

template<typename LowLength, typename OffsetLength>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_offset_transform ( const LowLength & low_length,
const OffsetLength & offset_length )
constexpr

◆ make_pad_transform()

template<typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_pad_transform ( const LowLength & low_length,
const LeftPad & left_pad,
const RightPad & right_pad,
bool_constant< SkipIsValidCheck > = bool_constant<false>{} )
constexpr

◆ make_page_block_navigator() [1/2]

template<typename TensorView>
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator ( const TensorView & tensor_view)

◆ make_page_block_navigator() [2/2]

template<typename DataType, index_t VirtualDim, typename TensorView>
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator ( copy_const_t< DataType, void > * physical_blocks,
long_index_t block_stride,
long_index_t fixed_offset,
const int32_t * physical_block_indices,
index_t num_blocks,
index_t page_block_size,
const TensorView & complete_view,
const TensorView & last_view )

◆ make_ParallelTensorFunctor()

template<typename F, typename... Xs>
CK_TILE_HOST auto ck_tile::make_ParallelTensorFunctor ( F f,
Xs... xs )

◆ make_pass_through_transform()

template<typename LowLength>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_pass_through_transform ( const LowLength & low_length)
constexpr

◆ make_replicate_transform()

template<typename UpLengths>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_replicate_transform ( const UpLengths & up_lengths)
constexpr

◆ make_right_pad_transform()

template<typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_right_pad_transform ( const LowLength & low_length,
const RightPadLength & right_pad_,
bool_constant< SkipIsValidCheck > = bool_constant<false>{} )
constexpr

◆ make_sequence()

template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_sequence ( number< Is > ...)
constexpr

◆ make_single_stage_tensor_adaptor()

template<typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_single_stage_tensor_adaptor ( const Transforms & transforms,
LowerDimensionOldTopIdss ,
UpperDimensionNewTopIdss  )
constexpr

◆ make_slice_transform()

template<typename LowLength, typename SliceBegin, typename SliceEnd>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_slice_transform ( const LowLength & low_length,
const SliceBegin & slice_begin,
const SliceEnd & slice_end )
constexpr

◆ make_static_distributed_tensor() [1/2]

template<typename DataType, typename StaticTileDistribution>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_static_distributed_tensor ( const StaticTileDistribution & )
constexpr

◆ make_static_distributed_tensor() [2/2]

template<typename DataType, typename StaticTileDistribution, typename ThreadBuffer>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_static_distributed_tensor ( const StaticTileDistribution & ,
ThreadBuffer && thread_buffer_ )
constexpr

◆ make_static_tile_distribution()

template<typename StaticTileDistributionEncoding_>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_static_tile_distribution ( StaticTileDistributionEncoding_ )
constexpr

◆ make_tensor_adaptor_coordinate()

template<typename Adaptor, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_tensor_adaptor_coordinate ( const Adaptor & adaptor,
const TopIndex & idx_top )
constexpr

◆ make_tensor_coordinate()

template<typename TensorDesc, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_tensor_coordinate ( const TensorDesc & tensor_desc,
const TopIndex & idx_top )
constexpr

◆ make_tensor_descriptor_from_adaptor()

template<typename Adaptor, typename ElementSpaceSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_tensor_descriptor_from_adaptor ( const Adaptor & adaptor,
const ElementSpaceSize & element_space_size )
constexpr

◆ make_tensor_view()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_tensor_view ( DataType *__restrict__ p,
const tensor_descriptor< Ts... > & desc )
constexpr

◆ make_thread_buffer()

template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_thread_buffer ( Ts &&... ts)
constexpr

◆ make_tile_scatter_gather() [1/6]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, typename StaticValidArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
const StaticPageIndexArray_ & page_idx,
const StaticValidArray_ & valids,
number< HsGatherDim > = {},
number< NumCoord > = {} )
constexpr

◆ make_tile_scatter_gather() [2/6]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
const StaticPageIndexArray_ & page_idx,
number< HsGatherDim > = {},
number< NumCoord > = {} )
constexpr

◆ make_tile_scatter_gather() [3/6]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, typename StaticValidArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const multi_index< TensorView::get_num_of_dimension()> & origin,
const StaticTileDistribution & tile_distribution,
const StaticPageIndexArray & page_idx,
const StaticValidArray & valids,
number< HsGatherDim > = {} )
constexpr

◆ make_tile_scatter_gather() [4/6]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const multi_index< TensorView::get_num_of_dimension()> & origin,
const StaticTileDistribution & tile_distribution,
const StaticPageIndexArray & page_idx,
number< HsGatherDim > = {} )
constexpr

◆ make_tile_scatter_gather() [5/6]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, typename StaticValidArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const StaticTileDistribution & tile_distribution,
const StaticPageIndexArray & page_idx,
const StaticValidArray & valids,
number< HsGatherDim > = {} )
constexpr

◆ make_tile_scatter_gather() [6/6]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution, typename StaticPageIndexArray, index_t HsGatherDim>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const StaticTileDistribution & tile_distribution,
const StaticPageIndexArray & page_idx,
number< HsGatherDim > = {} )
constexpr

◆ make_tile_window() [1/7]

template<typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const null_tile_window< WindowLengths > & t,
const StaticTileDistribution &  )
constexpr

◆ make_tile_window() [2/7]

template<typename TensorView_, typename WindowLengths_>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin )
constexpr

◆ make_tile_window() [3/7]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
number< NumCoord > = {} )
constexpr

◆ make_tile_window() [4/7]

template<typename TensorView, typename WindowLengths>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const multi_index< TensorView::get_num_of_dimension()> & origin )
constexpr

◆ make_tile_window() [5/7]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const multi_index< TensorView::get_num_of_dimension()> & origin,
const StaticTileDistribution & tile_distribution )
constexpr

◆ make_tile_window() [6/7]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const StaticTileDistribution & tile_distribution )
constexpr

◆ make_tile_window() [7/7]

template<typename WindowLengths, typename... Ts>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window ( null_tensor_view ,
const WindowLengths & window_lengths,
const multi_index< WindowLengths::size()> & ,
Ts && ... )
constexpr

◆ make_tile_window_linear() [1/2]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window_linear ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
LinearBottomDims_ = {} )
constexpr

◆ make_tile_window_linear() [2/2]

template<typename TileWindow_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window_linear ( const TileWindow_ & tile_window,
const StaticTileDistribution_ & tile_distribution,
LinearBottomDims_ = {} )
constexpr

◆ make_tile_window_linear_raw() [1/2]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE auto ck_tile::make_tile_window_linear_raw ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
LinearBottomDims_ = {} )

◆ make_tile_window_linear_raw() [2/2]

template<typename TileWindow_, typename StaticTileDistribution_, typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window_linear_raw ( const TileWindow_ & tile_window,
const StaticTileDistribution_ & tile_distribution,
LinearBottomDims_ = {} )
constexpr

◆ make_tile_window_raw() [1/2]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE auto ck_tile::make_tile_window_raw ( const TensorView_ & tensor_view,
const WindowLengths_ & window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> & origin,
const StaticTileDistribution_ & tile_distribution,
number< NumCoord > = {} )

◆ make_tile_window_raw() [2/2]

template<typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto ck_tile::make_tile_window_raw ( const tile_window_with_static_lengths< TensorView, WindowLengths > & tile_window,
const StaticTileDistribution & tile_distribution )
constexpr

◆ make_transposed_distr_encode()

template<typename T, index_t LaneGroupSize, index_t kOuterDistDim0, index_t kOuterDistDim1, index_t kInnerDistDim0, index_t kInnerDistDim1>
CK_TILE_DEVICE constexpr auto ck_tile::make_transposed_distr_encode ( )
constexpr

◆ make_tuple()

template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_tuple ( Xs &&... xs)
constexpr

◆ make_unmerge_transform()

template<typename UpLengths, bool Use24BitIntegerCalculation = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_unmerge_transform ( const UpLengths & up_lengths,
bool_constant< Use24BitIntegerCalculation > = bool_constant<false>{} )
constexpr

◆ make_wave_buffer_resource()

template<typename ForceSGPR = std::false_type>
CK_TILE_DEVICE int32x4_t ck_tile::make_wave_buffer_resource ( const void * ptr,
uint32_t size = 0xffffffff,
ForceSGPR = {} )

◆ make_xor_transform()

template<typename LowLengths>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_xor_transform ( const LowLengths & low_lengths)
constexpr

◆ make_zero_multi_index()

template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::make_zero_multi_index ( )
constexpr

◆ makeTuple()

template<size_t... Idx>
std::tuple< std::integral_constant< size_t, Idx >... > ck_tile::makeTuple ( std::index_sequence< Idx... > )
constexpr noexcept

◆ max() [1/8]

template<>
CK_TILE_DEVICE constexpr double ck_tile::max ( double x,
double y )
constexpr

◆ max() [2/8]

template<>
CK_TILE_DEVICE constexpr float ck_tile::max ( float x,
float y )
constexpr

◆ max() [3/8]

template<index_t Y>
CK_TILE_HOST_DEVICE constexpr index_t ck_tile::max ( index_t x,
number< Y >  )
constexpr

◆ max() [4/8]

template<index_t X>
CK_TILE_HOST_DEVICE constexpr index_t ck_tile::max ( number< X > ,
index_t y )
constexpr

◆ max() [5/8]

template<typename T>
CK_TILE_HOST_DEVICE constexpr T ck_tile::max ( T x)
constexpr

◆ max() [6/8]

template<typename T>
CK_TILE_DEVICE constexpr T ck_tile::max ( T x,
T y )
constexpr

◆ max() [7/8]

template<typename T>
CK_TILE_HOST constexpr T ck_tile::max ( T x,
T y )
constexpr

◆ max() [8/8]

template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::max ( X x,
Ys... ys )
constexpr

◆ mem_op_string()

template<memory_operation_enum MemOp_>
std::string ck_tile::mem_op_string ( )

◆ merge_sequences()

template<typename... Seqs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::merge_sequences ( Seqs... )
constexpr

◆ min() [1/8]

template<>
CK_TILE_DEVICE constexpr double ck_tile::min ( double x,
double y )
constexpr

◆ min() [2/8]

template<>
CK_TILE_DEVICE constexpr float ck_tile::min ( float x,
float y )
constexpr

◆ min() [3/8]

template<index_t Y>
CK_TILE_HOST_DEVICE constexpr index_t ck_tile::min ( index_t x,
number< Y >  )
constexpr

◆ min() [4/8]

template<index_t X>
CK_TILE_HOST_DEVICE constexpr index_t ck_tile::min ( number< X > ,
index_t y )
constexpr

◆ min() [5/8]

template<typename T>
CK_TILE_HOST_DEVICE constexpr T ck_tile::min ( T x)
constexpr

◆ min() [6/8]

template<typename T>
CK_TILE_DEVICE constexpr T ck_tile::min ( T x,
T y )
constexpr

◆ min() [7/8]

template<typename T>
CK_TILE_HOST constexpr T ck_tile::min ( T x,
T y )
constexpr

◆ min() [8/8]

template<typename X, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::min ( X x,
Ys... ys )
constexpr

◆ minus()

__host__ __device__ ck_tile::minus ( ) ->minus< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ modify_sequence_elements_by_ids()

template<typename Seq, typename Values, typename Ids>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::modify_sequence_elements_by_ids ( Seq ,
Values ,
Ids  )
constexpr

◆ moe_gemm_kernel()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC, int MoeGemmKind = 0, typename ActivationOp = identity>
__global__ void ck_tile::moe_gemm_kernel ( const ck_tile::index_t * p_sorted_token_ids_,
const ck_tile::index_t * p_sorted_expert_ids_,
const ck_tile::index_t * p_max_token_id_,
const ADataType * A,
const BDataType * B,
CDataType * C,
const AccDataType * expert_weight_ptr,
ck_tile::index_t Num_tokens,
ck_tile::index_t TokensPerBlock,
ck_tile::index_t TopK,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float * scale_A_ptr,
float * scale_B_ptr,
float * expert_bias_ptr )

◆ moe_sorting_get_smem_row_col()

CK_TILE_HOST constexpr auto ck_tile::moe_sorting_get_smem_row_col ( int tokens_,
int num_experts_ )
constexpr

◆ moe_sorting_get_sub_token()

CK_TILE_HOST index_t ck_tile::moe_sorting_get_sub_token ( int tokens_,
int num_experts_ )

◆ moe_sorting_get_workspace_size()

CK_TILE_HOST index_t ck_tile::moe_sorting_get_workspace_size ( int tokens_,
int num_experts_,
int topk_,
int dispatch_policy_ )

◆ moe_sorting_is_oneshot()

CK_TILE_HOST bool ck_tile::moe_sorting_is_oneshot ( int tokens_,
int num_experts_ )

◆ moe_sorting_mp_get_workspace_size()

CK_TILE_HOST index_t ck_tile::moe_sorting_mp_get_workspace_size ( int tokens_,
int num_experts_,
int topk_ )

◆ move_tensor_adaptor_coordinate() [1/2]

template<bool JudgeDoTransforms = true, typename Adaptor, typename AdaptorCoord, typename TopIndex>
CK_TILE_HOST_DEVICE constexpr void ck_tile::move_tensor_adaptor_coordinate ( const Adaptor & adaptor,
AdaptorCoord & coord,
const TopIndex & idx_diff_top )
constexpr

◆ move_tensor_adaptor_coordinate() [2/2]

template<bool JudgeDoTransforms = true, typename Adaptor, typename AdaptorCoord, typename TopIndex, typename BottomIndex>
CK_TILE_HOST_DEVICE constexpr void ck_tile::move_tensor_adaptor_coordinate ( const Adaptor & adaptor,
AdaptorCoord & coord,
const TopIndex & idx_diff_top,
BottomIndex & idx_diff_bottom )
constexpr

◆ move_tensor_coordinate()

template<bool JudgeDoTransforms = true, typename TensorDesc, typename TensorCoord, typename Index>
CK_TILE_HOST_DEVICE constexpr void ck_tile::move_tensor_coordinate ( const TensorDesc & tensor_desc,
TensorCoord & coord,
const Index & coord_step )
constexpr

◆ move_tile_window() [1/7]

template<typename WindowLengths>
CK_TILE_DEVICE void ck_tile::move_tile_window ( null_tile_window< WindowLengths > & ,
const typename null_tile_window< WindowLengths >::BottomTensorIndex &  )

◆ move_tile_window() [2/7]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename LinearBottomDims_>
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > & window,
const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex & step )

◆ move_tile_window() [3/7]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord>
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > & window,
const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex & step )

◆ move_tile_window() [4/7]

template<typename TensorView_, typename WindowLengths_>
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_with_static_lengths< TensorView_, WindowLengths_ > & window,
const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex & step )

◆ move_tile_window() [5/7]

template<typename TileWindow_>
CK_TILE_DEVICE void ck_tile::move_tile_window ( TileWindow_ & window,
const typename TileWindow_::BottomTensorIndex & step )

◆ move_tile_window() [6/7]

template<typename TileWindowWithStaticDistributionType, typename StepType, typename std::enable_if_t< is_detected< is_tuple, TileWindowWithStaticDistributionType >::value > * = nullptr>
CK_TILE_DEVICE void ck_tile::move_tile_window ( TileWindowWithStaticDistributionType & window,
StepType & step )

◆ move_tile_window() [7/7]

template<typename TensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord>
CK_TILE_DEVICE void ck_tile::move_tile_window ( tuple< tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > > & window,
const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex & step )

◆ multiplies()

__host__ __device__ ck_tile::multiplies ( ) ->multiplies< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ naive_attention_fwd()

◆ naive_gemm_kernel()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
__global__ void ck_tile::naive_gemm_kernel ( ADataType * A,
BDataType * B,
CDataType * C,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC )

◆ neg() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::neg ( T x)

◆ neg() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::neg ( T x)

◆ neg< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::neg< double > ( double x)

◆ neg< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::neg< double > ( double x)

◆ neg< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::neg< float > ( float x)

◆ neg< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::neg< float > ( float x)

◆ neg< fp16_t >()

◆ neg< int32_t >() [1/2]

◆ neg< int32_t >() [2/2]

◆ neg< int8_t >() [1/2]

◆ neg< int8_t >() [2/2]

◆ next_power_of_two() [1/3]

template<index_t X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::next_power_of_two ( )
constexpr

◆ next_power_of_two() [2/3]

CK_TILE_HOST_DEVICE constexpr int32_t ck_tile::next_power_of_two ( int32_t x)
constexpr

◆ next_power_of_two() [3/3]

template<index_t X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::next_power_of_two ( number< X > )
constexpr

◆ operator!=() [1/3]

template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator!= ( const array< T, Size > & a,
const array< T, Size > & b )
constexpr

◆ operator!=() [2/3]

template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator!= ( const tuple< Xs... > & a,
const tuple< Xs... > & b )
constexpr

◆ operator!=() [3/3]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator!= ( sequence< Xs... > x,
sequence< Ys... > y )
constexpr

◆ operator%() [1/3]

template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator% ( number< Y > ,
sequence< Xs... >  )
constexpr

◆ operator%() [2/3]

template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator% ( sequence< Xs... > ,
number< Y >  )
constexpr

◆ operator%() [3/3]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator% ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ operator*() [1/10]

template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( const multi_index< NSize > & a,
const T & b )
constexpr

◆ operator*() [2/10]

template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( const multi_index< NSize > & x,
index_t a )
constexpr

◆ operator*() [3/10]

template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( const tuple< Xs... > & x,
const tuple< Ys... > & y )
constexpr

◆ operator*() [4/10]

template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( const tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator*() [5/10]

template<typename... Xs, typename Y, std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( const tuple< Xs... > & x,
Y a )
constexpr

◆ operator*() [6/10]

template<index_t NSize>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( index_t a,
const multi_index< NSize > & x )
constexpr

◆ operator*() [7/10]

template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( number< Y > ,
sequence< Xs... >  )
constexpr

◆ operator*() [8/10]

template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( sequence< Xs... > ,
number< Y >  )
constexpr

◆ operator*() [9/10]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ operator*() [10/10]

template<typename... Xs, typename Y, std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator* ( Y a,
const tuple< Xs... > & x )
constexpr

◆ operator+() [1/6]

template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( const multi_index< NSize > & a,
const T & b )
constexpr

◆ operator+() [2/6]

template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( const tuple< Xs... > & x,
const tuple< Ys... > & y )
constexpr

◆ operator+() [3/6]

template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( const tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator+() [4/6]

template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( number< Y > ,
sequence< Xs... >  )
constexpr

◆ operator+() [5/6]

template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( sequence< Xs... > ,
number< Y >  )
constexpr

◆ operator+() [6/6]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+ ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ operator+=() [1/2]

template<index_t NSize, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+= ( multi_index< NSize > & y,
const X & x )
constexpr

◆ operator+=() [2/2]

template<typename... Ys, typename X, std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator+= ( tuple< Ys... > & y,
const X & x )
constexpr

◆ operator-() [1/6]

template<index_t NSize, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( const multi_index< NSize > & a,
const T & b )
constexpr

◆ operator-() [2/6]

template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( const tuple< Xs... > & x,
const tuple< Ys... > & y )
constexpr

◆ operator-() [3/6]

template<typename... Xs, typename Y, std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( const tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator-() [4/6]

template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( number< Y > ,
sequence< Xs... >  )
constexpr

◆ operator-() [5/6]

template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( sequence< Xs... > ,
number< Y >  )
constexpr

◆ operator-() [6/6]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator- ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ operator-=() [1/2]

template<index_t NSize, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator-= ( multi_index< NSize > & y,
const X & x )
constexpr

◆ operator-=() [2/2]

template<typename... Ys, typename X, std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator-= ( tuple< Ys... > & y,
const X & x )
constexpr

◆ operator/() [1/4]

template<typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator/ ( const tuple< Xs... > & x,
const tuple< Ys... > & y )
constexpr

◆ operator/() [2/4]

template<index_t Y, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator/ ( number< Y > ,
sequence< Xs... >  )
constexpr

◆ operator/() [3/4]

template<index_t... Xs, index_t Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator/ ( sequence< Xs... > ,
number< Y >  )
constexpr

◆ operator/() [4/4]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::operator/ ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ operator<<()

template<typename T>
std::ostream & ck_tile::operator<< ( std::ostream & os,
const std::vector< T > & v )

Stream operator overload for vector output.

Provides a formatted string representation of a vector, useful for debugging and logging.

Template Parameters
T: Type of vector elements
Parameters
os: Output stream
v: Vector to output
Returns
Reference to the output stream

◆ operator==() [1/3]

template<typename T, index_t Size>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator== ( const array< T, Size > & a,
const array< T, Size > & b )
constexpr

◆ operator==() [2/3]

template<typename... Xs>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator== ( const tuple< Xs... > & a,
const tuple< Xs... > & b )
constexpr

◆ operator==() [3/3]

template<index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::operator== ( sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ pad_tensor_view()

template<typename TensorView, typename TileLengths, typename DoPads>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::pad_tensor_view ( const TensorView & tensor_view,
const TileLengths & tile_lengths,
DoPads  )
constexpr

◆ permute_vectors_i4x4_b()

template<typename Tensor>
void ck_tile::permute_vectors_i4x4_b ( Tensor & tensor)

Permute packed int4 vectors for device implementation compatibility.

This function transforms 4 pk_int4_t values from the original layout to a hardware-optimized layout.

Each pk_int4_t contains two 4-bit values packed in the high and low nibbles of an int8_t

Example:

  • Input: 0x76, 0x54, 0x32, 0x10
  • Output: 0x75, 0x31, 0x64, 0x20
Note
Input tensor length must be a multiple of 4

This transformation is required before transferring B matrix data (of type pk_int4_t) to device. The device conversion functions (i4_to_half4, i4_to_bhalf4, amd_assembly_i4_to_fp8x8, amd_assembly_i4_to_bf8x8) require data in 0x75316420 order to correctly convert pk_int4_t to other numeric types.

◆ pick_sequence_elements_by_ids()

template<typename Seq, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::pick_sequence_elements_by_ids ( Seq ,
sequence< Is... >  )
constexpr

◆ pick_sequence_elements_by_mask()

template<typename Seq, typename Mask>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::pick_sequence_elements_by_mask ( Seq ,
Mask  )
constexpr

◆ pk_add_f16()

CK_TILE_DEVICE fp16x2_t ck_tile::pk_add_f16 ( const fp16x2_t & x,
const fp16x2_t & y )

◆ pk_fp4_to_bf16()

CK_TILE_HOST_DEVICE constexpr bf16_t ck_tile::pk_fp4_to_bf16 ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_fp4_to_bf16x2()

CK_TILE_HOST_DEVICE constexpr bf16x2_t ck_tile::pk_fp4_to_bf16x2 ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_fp4_to_float()

CK_TILE_HOST_DEVICE constexpr float ck_tile::pk_fp4_to_float ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_fp4_to_fp16()

CK_TILE_HOST_DEVICE constexpr fp16_t ck_tile::pk_fp4_to_fp16 ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_fp4_to_fp16x2()

CK_TILE_HOST_DEVICE constexpr fp16x2_t ck_tile::pk_fp4_to_fp16x2 ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_fp4_to_fp32x2()

CK_TILE_HOST_DEVICE constexpr fp32x2_t ck_tile::pk_fp4_to_fp32x2 ( const pk_fp4_t & x,
float scale )
constexpr

◆ pk_int4_t_to_bfloat16x2_t()

CK_TILE_HOST_DEVICE bf16x2_t ck_tile::pk_int4_t_to_bfloat16x2_t ( const pk_int4_t & x)

◆ pk_int4_t_to_fp32x2_t()

CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t ( const pk_int4_t & x)

◆ pk_int4_t_to_fp32x2_t_signed_conversion()

CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t_signed_conversion ( const pk_int4_t & x)

◆ pk_int4_t_to_halfx2_t()

CK_TILE_HOST_DEVICE fp16x2_t ck_tile::pk_int4_t_to_halfx2_t ( const pk_int4_t & x)

◆ pk_int4_t_to_int8x2_t()

CK_TILE_HOST_DEVICE int8x2_t ck_tile::pk_int4_t_to_int8x2_t ( const pk_int4_t & x)

◆ plus()

__host__ __device__ ck_tile::plus ( ) ->plus< void, void >

FIXME: create macro to replace '__host__ __device__' and nothing more.

◆ pow() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::pow ( T x,
T gamma )

◆ pow() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::pow ( T x,
T gamma )

◆ pow< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::pow< double > ( double x,
double gamma )

◆ pow< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::pow< double > ( double x,
double gamma )

◆ pow< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::pow< float > ( float x,
float gamma )

◆ pow< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::pow< float > ( float x,
float gamma )

◆ prefix_sum_sequence()

template<typename Seq>
auto ck_tile::prefix_sum_sequence ( Seq )
constexpr

◆ preprocess_profiling_impl()

template<typename TimerType, typename PreprocessFunc>
CK_TILE_HOST double ck_tile::preprocess_profiling_impl ( TimerType timer,
const stream_config & s,
PreprocessFunc preprocess )

◆ print() [1/14]

template<address_space_enum BufferAddressSpace, typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum Coherence>
CK_TILE_HOST_DEVICE void ck_tile::print ( const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > & bv)

◆ print() [2/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const char & value)

Specialization for char.

◆ print() [3/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const double & value)

Specialization for double.

◆ print() [4/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const float & value)

Specialization for float.

◆ print() [5/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const int & value)

Specialization for int.

◆ print() [6/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const long & value)

Specialization for long.

◆ print() [7/14]

template<typename T>
CK_TILE_HOST_DEVICE void ck_tile::print ( const T & )

Declare a ck_tile::print() interface that gets specialized in each header file for types that can be printed.

◆ print() [8/14]

template<typename T, size_t N>
CK_TILE_HOST_DEVICE void ck_tile::print ( const T(&value)[N])

Specialization for array.

◆ print() [9/14]

template<typename PsYs2XsAdaptor_, typename Ys2DDescriptor_, typename StaticTileDistributionEncoding_, typename TileDistributionDetail_>
CK_TILE_HOST_DEVICE void ck_tile::print ( const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > & distribution)

◆ print() [10/14]

template<typename RsLengths_, typename HsLengthss_, typename Ps2RHssMajor_, typename Ps2RHssMinor_, typename Ys2RHsMajor_, typename Ys2RHsMinor_>
CK_TILE_HOST_DEVICE void ck_tile::print ( const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > & encoding)

◆ print() [11/14]

template<index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize, tile_distribution_pattern DistributionPattern, index_t NumWaveGroups>
CK_TILE_HOST_DEVICE void ck_tile::print ( const tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > & )

◆ print() [12/14]

template<typename... T>
CK_TILE_HOST_DEVICE void ck_tile::print ( const tuple< T... > & t)

◆ print() [13/14]

template<typename RsLengths_, typename HsLengthss_, typename Ps2RHssMajor_, typename Ps2RHssMinor_, typename Ys2RHsMajor_, typename Ys2RHsMinor_>
CK_TILE_HOST_DEVICE void ck_tile::print ( const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail & detail_obj)

◆ print() [14/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const unsigned int & value)

Specialization for unsigned int.

◆ quant_type_to_string()

std::string ck_tile::quant_type_to_string ( QuantType quant_type)
inline

◆ rcp() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::rcp ( T x)

◆ rcp() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::rcp ( T x)

◆ reduce_on_sequence()

template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr index_t ck_tile::reduce_on_sequence ( Seq ,
Reduce f,
number< Init >  )
constexpr

◆ reference_batched_dropout()

template<typename DataType, typename RandValOutputDataType>
CK_TILE_HOST void ck_tile::reference_batched_dropout ( HostTensor< DataType > & in_out_b_m_n,
const HostTensor< RandValOutputDataType > & randval_b_m_n,
const uint8_t & p_undrop_in_uint8_t,
const float scale )

◆ reference_batched_dropout_randval()

template<typename RandValOutputDataType>
CK_TILE_HOST void ck_tile::reference_batched_dropout_randval ( HostTensor< RandValOutputDataType > & randval_b_m_n,
index_t batch,
uint64_t drop_seed,
uint64_t drop_offset )

◆ reference_batched_elementwise()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void ck_tile::reference_batched_elementwise ( const HostTensor< ADataType > & a_b_m_n,
const HostTensor< BDataType > & b_b_m_n,
HostTensor< CDataType > & c_b_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const BinaryElementOp & binary_element_op = {} )

◆ reference_batched_gemm()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_batched_gemm ( const HostTensor< ADataType > & a_b_m_k,
const HostTensor< BDataType > & b_b_n_k,
HostTensor< CDataType > & c_b_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const ACCElementOp & acc_element_op = {} )

◆ reference_batched_gemm_gpu()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void ck_tile::reference_batched_gemm_gpu ( ADataType * a_ptr,
BDataType * b_ptr,
CDataType * c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t batch_stride_A,
index_t batch_stride_B,
index_t batch_stride_C,
index_t batch_count )

◆ reference_batched_masking()

template<typename CDataType, typename MaskingType>
CK_TILE_HOST void ck_tile::reference_batched_masking ( HostTensor< CDataType > & c_b_m_n,
const MaskingType & mask )

◆ reference_batched_rotary_position_embedding()

template<typename DataType, typename ComputeDataType = float>
CK_TILE_HOST void ck_tile::reference_batched_rotary_position_embedding ( const HostTensor< DataType > & input_bsd,
const HostTensor< DataType > & cos_sd,
const HostTensor< DataType > & sin_sd,
bool interleaved,
HostTensor< DataType > & output_bsd,
bool use_1_row_sin_cos = false )

◆ reference_batched_softmax()

template<typename ADataType, typename CompDataType, typename BDataType, typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_batched_softmax ( const HostTensor< ADataType > & a_b_m_n,
HostTensor< BDataType > & b_b_m_n,
const CompElementOp & comp_element_op = {},
std::optional< std::reference_wrapper< HostTensor< CompDataType > > > lse_b_m = std::nullopt )

◆ reference_batched_transpose()

template<typename Type>
CK_TILE_HOST void ck_tile::reference_batched_transpose ( const HostTensor< Type > & x,
HostTensor< Type > & y,
std::string layout_in = "NCHW",
std::string layout_out = "NHWC" )

◆ reference_binary_elementwise()

template<typename ADataType, typename BDataType, typename CDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void ck_tile::reference_binary_elementwise ( const HostTensor< ADataType > & a,
const HostTensor< BDataType > & b,
HostTensor< CDataType > & c,
ElementOp element_op )

◆ reference_blockwise_gemm_gpu()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void ck_tile::reference_blockwise_gemm_gpu ( ADataType * a_ptr,
BDataType * b_ptr,
CDataType * c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float * scale_A_ptr,
float * scale_B_ptr )

◆ reference_fused_moe()

template<typename AccDataType, typename Activation, typename ADataType, typename GDataType, typename DDataType, typename ODataType, typename AScaleDataType, typename GScaleDataType, typename DScaleDataType, typename YSmoothScaleDataType, typename TopkWeightDataType, typename IndexDataType>
void ck_tile::reference_fused_moe ( const ck_tile::HostTensor< ADataType > & a_host,
const ck_tile::HostTensor< GDataType > & g_host,
const ck_tile::HostTensor< DDataType > & d_host,
const ck_tile::HostTensor< AScaleDataType > & sa_host,
const ck_tile::HostTensor< GScaleDataType > & sg_host,
const ck_tile::HostTensor< DScaleDataType > & sd_host,
const ck_tile::HostTensor< YSmoothScaleDataType > & sy_host,
ck_tile::HostTensor< ODataType > & o_host,
const ck_tile::HostTensor< IndexDataType > & sorted_token_ids_host,
const ck_tile::HostTensor< TopkWeightDataType > & sorted_weight_host,
const ck_tile::HostTensor< IndexDataType > & sorted_expert_ids_host,
const ck_tile::HostTensor< IndexDataType > & num_sorted_tiles_host,
const ck_tile::HostTensor< IndexDataType > & token_ids_host,
ck_tile::index_t block_m,
ck_tile::index_t tokens,
ck_tile::index_t experts,
ck_tile::index_t hidden_size,
ck_tile::index_t intermediate_size,
ck_tile::index_t topk,
ck_tile::index_t gate_only )

◆ reference_gemm()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm ( const HostTensor< ADataType > & a_m_k,
const HostTensor< BDataType > & b_k_n,
HostTensor< CDataType > & c_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const ACCElementOp & acc_element_op = {} )

◆ reference_gemm_gpu()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC>
void ck_tile::reference_gemm_gpu ( ADataType * a_ptr,
BDataType * b_ptr,
CDataType * c_ptr,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c )

◆ reference_gemm_multiple_abd()

template<typename AsDataType, typename BsDataType, typename DsDataType, typename AccDataType, typename CDataType, typename AElementOp, typename BElementOp, typename CDElementOp, typename ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>, typename BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>, typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void ck_tile::reference_gemm_multiple_abd ( const std::array< HostTensor< ADataType >, AsDataType::size()> & as_m_k,
const std::array< HostTensor< BDataType >, BsDataType::size()> & bs_k_n,
const std::array< HostTensor< DDataType >, DsDataType::size()> & ds_m_n,
HostTensor< ADataType > & a_m_k,
HostTensor< BDataType > & b_k_n,
HostTensor< CDataType > & c_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const CDElementOp & acc_element_op = {} )

◆ reference_gemm_multiple_d()

template<typename ADataType, typename BDataType, typename DsDataType, typename AccDataType, typename CDataType, typename ACCElementOp, typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void ck_tile::reference_gemm_multiple_d ( const HostTensor< ADataType > & a_m_k,
const HostTensor< BDataType > & b_k_n,
const std::array< HostTensor< DDataType >, DsDataType::size()> & ds_m_n,
HostTensor< CDataType > & c_m_n,
const ACCElementOp & acc_element_op = {} )

◆ reference_gemm_quant()

template<typename ADataType, typename QDataType, typename BDataType, typename AccDataType, typename CDataType, typename QuantGroupSize, bool aquant, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm_quant ( const HostTensor< ADataType > & a_m_k,
const HostTensor< QDataType > & q,
const HostTensor< BDataType > & b_k_n,
HostTensor< CDataType > & c_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const ACCElementOp & acc_element_op = {} )

◆ reference_gemm_rowcol_quant()

template<typename ADataType, typename AQDataType, typename BDataType, typename BQDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm_rowcol_quant ( const HostTensor< ADataType > & a_m_k,
const HostTensor< AQDataType > & aq_m_1,
const HostTensor< BDataType > & b_k_n,
const HostTensor< BQDataType > & bq_1_n,
HostTensor< CDataType > & c_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const ACCElementOp & acc_element_op = {} )

◆ reference_gemm_tensor_quant()

template<typename ADataType, typename AQDataType, typename BDataType, typename BQDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm_tensor_quant ( const HostTensor< ADataType > & a_m_k,
const HostTensor< AQDataType > & aq_1_1,
const HostTensor< BDataType > & b_k_n,
const HostTensor< BQDataType > & bq_1_1,
HostTensor< CDataType > & c_m_n,
const AElementOp & a_element_op = {},
const BElementOp & b_element_op = {},
const ACCElementOp & acc_element_op = {} )

◆ reference_grouped_conv_bwd_data()

template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_data ( HostTensor< InDataType > & input,
const HostTensor< WeiDataType > & weight,
const HostTensor< OutDataType > & output,
std::vector< ck_tile::long_index_t > conv_strides,
std::vector< ck_tile::long_index_t > conv_dilations,
std::vector< ck_tile::long_index_t > in_left_pads,
std::vector< ck_tile::long_index_t >  )

◆ reference_grouped_conv_bwd_weight()

template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_weight ( const HostTensor< InDataType > & input,
HostTensor< WeiDataType > & weight,
const HostTensor< OutDataType > & output,
std::vector< ck_tile::long_index_t > conv_strides,
std::vector< ck_tile::long_index_t > conv_dilations,
std::vector< ck_tile::long_index_t > in_left_pads,
std::vector< ck_tile::long_index_t >  )

◆ reference_grouped_conv_fwd()

template<ck_tile::index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType, typename Elfunc = ck_tile::element_wise::PassThrough, typename Tuple = ck_tile::tuple<>>
CK_TILE_HOST void ck_tile::reference_grouped_conv_fwd ( const HostTensor< InDataType > & input,
const HostTensor< WeiDataType > & weight,
HostTensor< OutDataType > & output,
std::vector< ck_tile::long_index_t > conv_strides,
std::vector< ck_tile::long_index_t > conv_dilations,
std::vector< ck_tile::long_index_t > in_left_pads,
std::vector< ck_tile::long_index_t > ,
Elfunc elfunc = Elfunc{},
Tuple ds = {} )

◆ reference_im2col()

template<typename InDataType, typename OutDataType, index_t NDimSpatial>
CK_TILE_HOST void ck_tile::reference_im2col ( const HostTensor< InDataType > & in_host,
HostTensor< OutDataType > & out_host,
const ck_tile::conv::ConvParam & conv_params )

◆ reference_layernorm2d_fwd()

template<typename XDataType, typename GammaDataType, typename BetaDataType, typename ComputeDataType, typename YDataType, typename MeanDataType, typename InvStdDataType, typename Epilogue = reference_layernorm2d_default_epilogue>
void ck_tile::reference_layernorm2d_fwd ( const HostTensor< XDataType > & x_m_n,
const HostTensor< GammaDataType > & gamma_n,
const HostTensor< BetaDataType > & beta_n,
HostTensor< YDataType > & y_m_n,
HostTensor< MeanDataType > & mean_m,
HostTensor< InvStdDataType > & invStd_m,
ComputeDataType epsilon,
Epilogue epilogue_functor = {} )

◆ reference_moe_gemm_gpu()

template<typename ADataType, typename BDataType, typename AccDataType, typename CDataType, typename LayoutA, typename LayoutB, typename LayoutC, int MoeGemmKind = 0, typename ActivationOp = identity>
void ck_tile::reference_moe_gemm_gpu ( const index_t * p_sorted_token_ids_,
const index_t * p_sorted_expert_ids_,
const index_t * p_max_token_id_,
const ADataType * a_ptr,
const BDataType * b_ptr,
CDataType * c_ptr,
const AccDataType * expert_weight_ptr,
index_t Num_tokens,
index_t TokensPerBlock,
index_t TopK,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float * scale_A_ptr,
float * scale_B_ptr,
float * exp_bias = nullptr )

◆ reference_moe_sorting()

template<typename WeightType, typename IndexType = index_t>
CK_TILE_HOST void ck_tile::reference_moe_sorting ( const HostTensor< IndexType > & topk_ids,
const HostTensor< WeightType > & weights,
const HostTensor< IndexType > & local_expert_mask,
HostTensor< IndexType > & p_sorted_token_ids,
HostTensor< WeightType > & sorted_weight,
HostTensor< IndexType > & sorted_expert_ids,
index_t & unit_cnt,
const index_t experts,
const index_t unit_size,
const index_t tokens,
bool local_expert_masking,
bool skip_experts_with_zero_token = true )

◆ reference_mx_gemm()

template<typename ADataType, typename BDataType, typename ScaleDataType, typename AccDataType, typename CDataType, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_mx_gemm ( const HostTensor< ADataType > & a_m_k,
const HostTensor< BDataType > & b_k_n,
HostTensor< CDataType > & c_m_n,
const HostTensor< ScaleDataType > & scale_a,
const HostTensor< ScaleDataType > & scale_b,
const AElementOp & = {},
const BElementOp & = {},
const ACCElementOp & = {} )

◆ reference_permute() [1/2]

template<typename DataType>
CK_TILE_HOST void ck_tile::reference_permute ( const HostTensor< DataType > & x,
HostTensor< DataType > & y,
std::vector< index_t > perm )

◆ reference_permute() [2/2]

template<typename DataType>
CK_TILE_HOST auto ck_tile::reference_permute ( const HostTensor< DataType > & x,
std::vector< index_t > perm )

◆ reference_pool2d()

template<typename InDataType, typename ComputeDataType, typename OutDataType, typename IndexDataType, typename ReduceOp, typename TensorShape, typename WindowShape, bool OutputIndex = false>
CK_TILE_HOST void ck_tile::reference_pool2d ( const HostTensor< InDataType > & input,
HostTensor< OutDataType > & output,
HostTensor< IndexDataType > & output_index,
PoolKernelArgs< TensorShape, WindowShape > kargs,
ReduceOp reduce_op )

◆ reference_pool3d()

template<typename InDataType, typename ComputeDataType, typename OutDataType, typename IndexDataType, typename ReduceOp, typename TensorShape, typename WindowShape, bool OutputIndex = false>
CK_TILE_HOST void ck_tile::reference_pool3d ( const HostTensor< InDataType > & input,
HostTensor< OutDataType > & output,
HostTensor< IndexDataType > & output_index,
PoolKernelArgs< TensorShape, WindowShape > kargs,
ReduceOp reduce_op )

◆ reference_reduce() [1/2]

template<typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
CK_TILE_HOST void ck_tile::reference_reduce ( const HostTensor< XDataType > & x_m_n,
HostTensor< YDataType > & y_m,
ReduceOp reduce_op )

◆ reference_reduce() [2/2]

template<typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp, typename KeptDim, typename ReduceDims>
CK_TILE_HOST void ck_tile::reference_reduce ( const HostTensor< XDataType > & x_tensor,
HostTensor< YDataType > & y_tensor,
ReduceOp reduce_op,
KeptDim kept_dim,
ReduceDims reduce_dims )

◆ reference_rmsnorm2d_fwd()

template<typename XDataType, typename GammaDataType, typename ComputeDataType, typename YDataType, typename InvRmsDataType, typename UnquantYDataType, typename Epilogue = reference_rmsnorm2d_default_epilogue>
void ck_tile::reference_rmsnorm2d_fwd ( const HostTensor< XDataType > & x_m_n,
const HostTensor< GammaDataType > & gamma_n,
HostTensor< YDataType > & y_m_n,
HostTensor< InvRmsDataType > & invRms_m,
HostTensor< UnquantYDataType > & unquant_y_m_n,
ComputeDataType epsilon,
Epilogue epilogue_functor = {},
const int use_model_sensitive_rmsnorm = static_cast<int>(Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL) )

◆ reference_rowwise_quantization2d()

template<typename XDataType, typename ScaleDataType, typename QXDataType>
CK_TILE_HOST void ck_tile::reference_rowwise_quantization2d ( const HostTensor< XDataType > & x_m_n,
const HostTensor< ScaleDataType > & scale_m,
HostTensor< QXDataType > & qx_m_n )

◆ reference_softmax() [1/2]

template<typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST void ck_tile::reference_softmax ( const HostTensor< InputType > & x,
HostTensor< OutputType > & y,
index_t dim = -1 )

◆ reference_softmax() [2/2]

template<typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST auto ck_tile::reference_softmax ( const HostTensor< InputType > & x,
index_t dim = -1 )

◆ reference_topk() [1/2]

template<typename DataType, typename IndexType = index_t>
CK_TILE_HOST void ck_tile::reference_topk ( const HostTensor< DataType > & x,
HostTensor< DataType > & y_values,
HostTensor< IndexType > & y_indices,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true )

◆ reference_topk() [2/2]

template<typename DataType, typename IndexType = index_t>
CK_TILE_HOST auto ck_tile::reference_topk ( const HostTensor< DataType > & x,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true )

◆ reference_transpose_elementwise()

template<typename ADataType, typename BDataType>
void ck_tile::reference_transpose_elementwise ( const HostTensor< ADataType > & a,
HostTensor< BDataType > & b )

◆ reference_unary_elementwise()

template<typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void ck_tile::reference_unary_elementwise ( const HostTensor< ADataType > & a,
HostTensor< BDataType > & b,
ElementOp element_op )

◆ replace_bottom_tensor_view() [1/3]

template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_, typename StaticTileDistribution_, typename StaticPageIndexArray_, typename StaticValidArray_, index_t HsGatherDim = 0, index_t NumCoord = 1>
CK_TILE_DEVICE auto ck_tile::replace_bottom_tensor_view ( const NewTensorView_ & new_tensor_view,
const tile_scatter_gather< OldTensorView_, WindowLengths_, StaticTileDistribution_, StaticPageIndexArray_, StaticValidArray_, HsGatherDim, NumCoord > & tile_window )

◆ replace_bottom_tensor_view() [2/3]

template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_, typename StaticTileDistribution_, index_t NumCoord = 1>
CK_TILE_DEVICE auto ck_tile::replace_bottom_tensor_view ( const NewTensorView_ & new_tensor_view,
const tile_window_with_static_distribution< OldTensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > & tile_window )

◆ replace_bottom_tensor_view() [3/3]

template<typename NewTensorView_, typename OldTensorView_, typename WindowLengths_>
CK_TILE_DEVICE auto ck_tile::replace_bottom_tensor_view ( const NewTensorView_ & new_tensor_view,
const tile_window_with_static_lengths< OldTensorView_, WindowLengths_ > & tile_window )

◆ report_error_stats()

CK_TILE_HOST void ck_tile::report_error_stats ( int err_count,
double max_err,
std::size_t total_size )

Report error statistics for numerical comparisons.

Outputs statistics about numerical comparison errors including count and maximum error.

Parameters
err_count: Number of errors found
max_err: Maximum error value encountered
total_size: Total number of elements compared

◆ reverse_exclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::reverse_exclusive_scan_sequence ( Seq ,
Reduce ,
number< Init >  )
constexpr

◆ reverse_inclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::reverse_inclusive_scan_sequence ( Seq ,
Reduce ,
number< Init >  )
constexpr

◆ reverse_slice_sequence()

template<typename Seq, index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
auto ck_tile::reverse_slice_sequence ( Seq ,
number< SliceSize > ,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{} )
constexpr

◆ s_nop()

CK_TILE_DEVICE void ck_tile::s_nop ( index_t cnt = 0)

◆ s_waitcnt()

template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void ck_tile::s_waitcnt ( )

◆ s_waitcnt_barrier()

template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void ck_tile::s_waitcnt_barrier ( )

◆ sad_u16()

CK_TILE_DEVICE uint16_t ck_tile::sad_u16 ( uint16_t x,
uint16_t y,
uint16_t acc )

◆ sad_u32()

CK_TILE_HOST uint32_t ck_tile::sad_u32 ( uint32_t x,
uint32_t y,
uint32_t acc )

TODO: replace inline asm when intrinsic is available

◆ scaled_type_convert()

template<typename Y, typename X>
CK_TILE_HOST_DEVICE constexpr Y ck_tile::scaled_type_convert ( X x,
float scale )
constexpr

◆ scales()

template<typename Scale>
__host__ __device__ ck_tile::scales ( Scale ) ->scales< Scale >

FIXME: create a macro that replaces '__host__ __device__' and nothing more.

◆ sequence_all_of()

template<typename Seq, typename F>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::sequence_all_of ( Seq ,
F f )
constexpr

◆ sequence_any_of()

template<typename Seq, typename F>
CK_TILE_HOST_DEVICE constexpr bool ck_tile::sequence_any_of ( Seq ,
F f )
constexpr

◆ sequence_pop_back()

template<typename Seq>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::sequence_pop_back ( Seq )
constexpr

◆ sequence_pop_front()

template<index_t I, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::sequence_pop_front ( sequence< I, Is... > )
constexpr

◆ sequence_to_tuple_of_number()

template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::sequence_to_tuple_of_number ( sequence< Is... > )
constexpr

◆ set_buffer_value()

template<typename T>
__global__ void ck_tile::set_buffer_value ( T * p,
T x,
uint64_t buffer_element_size )

◆ set_container_subset() [1/2]

template<typename T, index_t N, index_t... Is>
CK_TILE_HOST_DEVICE constexpr void ck_tile::set_container_subset ( array< T, N > & y,
sequence< Is... > picks,
const array< T, sizeof...(Is)> & x )
constexpr

◆ set_container_subset() [2/2]

template<typename Y, typename X, index_t... Is>
CK_TILE_HOST_DEVICE constexpr void ck_tile::set_container_subset ( Y & y,
sequence< Is... > picks,
const X & x )
constexpr

◆ set_slice_tile()

template<typename DstDataType_, typename DstStaticTileDistribution_, typename SrcDataType_, typename SrcStaticTileDistribution_, index_t... SliceBegins, index_t... SliceEnds>
CK_TILE_DEVICE constexpr auto ck_tile::set_slice_tile ( static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > & dst_tile,
const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > & src_tile,
sequence< SliceBegins... > slice_begins,
sequence< SliceEnds... > slice_ends )
constexpr

◆ set_tile() [1/4]

template<typename DstrTensors, typename T>
CK_TILE_DEVICE void ck_tile::set_tile ( DstrTensors & dstr_tensor,
const T & value )

◆ set_tile() [2/4]

template<typename DstrTensors, index_t v, bool skip_subdword_opt = false>
CK_TILE_DEVICE void ck_tile::set_tile ( DstrTensors & dstr_tensor,
number< v > ,
bool_constant< skip_subdword_opt > = {} )

◆ set_tile() [3/4]

template<typename T>
CK_TILE_DEVICE void ck_tile::set_tile ( null_tensor & ,
const T &  )

◆ set_tile() [4/4]

template<index_t v>
CK_TILE_DEVICE void ck_tile::set_tile ( null_tensor & ,
number< v >  )

◆ set_tile_if()

template<typename DataType, typename StaticTileDistribution, typename XIndicesPredicate>
CK_TILE_HOST_DEVICE void ck_tile::set_tile_if ( static_distributed_tensor< DataType, StaticTileDistribution > & out_tensor,
DataType value,
XIndicesPredicate predicate )

◆ shuffle_aq()

template<typename T>
auto ck_tile::shuffle_aq ( const ck_tile::HostTensor< T > * t,
int block_aq_k )

◆ shuffle_b()

template<typename GemmConfig, typename T>
auto ck_tile::shuffle_b ( const ck_tile::HostTensor< T > & t)

◆ shuffle_b_permuteN()

template<typename GemmConfig, typename T>
auto ck_tile::shuffle_b_permuteN ( const ck_tile::HostTensor< T > & t)

◆ shuffle_bq_permuteN()

template<typename GemmConfig, typename T>
auto ck_tile::shuffle_bq_permuteN ( const ck_tile::HostTensor< T > & t)

◆ shuffle_tile()

template<typename OutTensor, typename InTensor>
CK_TILE_DEVICE void ck_tile::shuffle_tile ( OutTensor & out,
const InTensor & in )

◆ sin() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::sin ( T x)

◆ sin() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::sin ( T x)

◆ sin< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::sin< double > ( double x)

◆ sin< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::sin< double > ( double x)

◆ sin< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::sin< float > ( float x)

◆ sin< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::sin< float > ( float x)

◆ sin< fp16_t >()

◆ sinh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::sinh ( T x)

◆ sinh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::sinh ( T x)

◆ sinh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::sinh< double > ( double x)

◆ sinh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::sinh< double > ( double x)

◆ sinh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::sinh< float > ( float x)

◆ sinh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::sinh< float > ( float x)

◆ slice_sequence()

template<typename Seq, index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
auto ck_tile::slice_sequence ( Seq ,
number< SliceSize > ,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{} )
constexpr

◆ sqrt() [1/4]

CK_TILE_DEVICE bfloat16_t ck_tile::sqrt ( bfloat16_t x)

◆ sqrt() [2/4]

CK_TILE_DEVICE double ck_tile::sqrt ( double x)

◆ sqrt() [3/4]

CK_TILE_DEVICE float ck_tile::sqrt ( float x)

◆ sqrt() [4/4]

CK_TILE_DEVICE fp16_t ck_tile::sqrt ( fp16_t x)

◆ store_tile() [1/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ store_tile() [2/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ store_tile() [3/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ store_tile_raw() [1/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ store_tile_raw() [2/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ store_tile_raw() [3/3]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ sweep_tile() [1/2]

template<typename DistributedTensor, typename F, typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void ck_tile::sweep_tile ( const DistributedTensor & ,
const F & f,
UnpacksPerXDim = {} )
constexpr

◆ sweep_tile() [2/2]

template<typename DistributedTensor, typename F, typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void ck_tile::sweep_tile ( const F & f,
UnpacksPerXDim = {} )
constexpr

◆ sweep_tile_span()

template<typename TileDistributedSpan_, typename F>
CK_TILE_DEVICE void ck_tile::sweep_tile_span ( TileDistributedSpan_ ,
const F & f )

◆ sweep_tile_uspan()

template<typename TileDistributedSpan_, typename F, typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void ck_tile::sweep_tile_uspan ( TileDistributedSpan_ ,
const F & f,
Unpacks = {} )

◆ tan() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::tan ( T x)

◆ tan() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::tan ( T x)

◆ tan< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::tan< double > ( double x)

◆ tan< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::tan< double > ( double x)

◆ tan< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::tan< float > ( float x)

◆ tan< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::tan< float > ( float x)

◆ tanh() [1/2]

template<typename T>
CK_TILE_DEVICE T ck_tile::tanh ( T x)

◆ tanh() [2/2]

template<typename T>
CK_TILE_HOST T ck_tile::tanh ( T x)

◆ tanh< double >() [1/2]

template<>
CK_TILE_DEVICE double ck_tile::tanh< double > ( double x)

◆ tanh< double >() [2/2]

template<>
CK_TILE_HOST double ck_tile::tanh< double > ( double x)

◆ tanh< float >() [1/2]

template<>
CK_TILE_DEVICE float ck_tile::tanh< float > ( float x)

◆ tanh< float >() [2/2]

template<>
CK_TILE_HOST float ck_tile::tanh< float > ( float x)

◆ tanh_fast()

template<typename T>
CK_TILE_DEVICE T ck_tile::tanh_fast ( T x)

◆ tanh_fast< float >()

template<>
CK_TILE_DEVICE float ck_tile::tanh_fast< float > ( float x)

◆ tie()

template<typename... Args>
tuple< Args &... > ck_tile::tie ( Args &... args)
constexpr noexcept

◆ tile_distribution_pattern_to_string()

const char * ck_tile::tile_distribution_pattern_to_string ( tile_distribution_pattern pattern)
constexpr

◆ tile_elementwise_in() [1/2]

template<typename InElementFunc, typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in ( const InElementFunc & ,
MaybeNullTensor && ... )

◆ tile_elementwise_in() [2/2]

template<typename InElementFunc, typename... InTensor, typename = std::enable_if_t< std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in ( const InElementFunc & in_element_func,
const InTensor &... in_dstr_tensors )

◆ tile_elementwise_inout() [1/2]

template<typename InOutElementFunc, typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout ( const InOutElementFunc & ,
MaybeNullTensor && ... )

◆ tile_elementwise_inout() [2/2]

template<typename InOutElementFunc, typename... InOutDstrTensors, typename = std::enable_if_t<std::conjunction_v< std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>>
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout ( const InOutElementFunc & inout_element_func,
InOutDstrTensors &... inout_dstr_tensors )

◆ tile_elementwise_inout_unpack() [1/2]

template<typename InElementFunc, typename Tuple>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack ( const InElementFunc & in_element_func,
const Tuple & t )

Template function that "unpacks" a tuple and applies an element-wise operation.

Parameters
in_element_func: Function to apply element-wise.
t: Any container holding the elements to process, with a known size and tuple-like semantics.
Returns
Calls the overloaded function, passing an index sequence.

◆ tile_elementwise_inout_unpack() [2/2]

template<typename InElementFunc, typename Tuple, size_t... I>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack ( const InElementFunc & in_element_func,
const Tuple & t,
std::index_sequence< I... >  )

Template function that "unpacks" a tuple and applies an element-wise operation.

Parameters
in_element_func: Function to apply element-wise.
t: Any container holding the elements to process, with a known size and tuple-like semantics.
Returns
Calls tile_elementwise_inout with unpacked tuple elements.

◆ tile_sweeper()

template<typename T, typename F, typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE_EXTERN ck_tile::tile_sweeper ( const T & ,
const F & ,
U = {} )->tile_sweeper< T, F, U >

◆ timing_loop_impl()

template<typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double ck_tile::timing_loop_impl ( TimerType timer,
const stream_config & s,
CallablesFunc && callables_func,
PreprocessFunc preprocess = nullptr )

◆ to_array() [1/2]

template<typename T, index_t N, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::to_array ( const std::vector< X > & x)
constexpr

◆ to_array() [2/2]

template<typename T, index_t N, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::to_array ( const X & x)
constexpr

◆ to_array_of_array()

template<typename... Seqs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::to_array_of_array ( tuple< Seqs... > t_of_s)
constexpr

◆ to_multi_index()

template<typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::to_multi_index ( const T & x)
constexpr

◆ to_sequence()

template<index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::to_sequence ( tuple< number< Is >... > )
constexpr

◆ transform_sequences() [1/3]

template<typename F, index_t... Xs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_sequences ( F f,
sequence< Xs... >  )
constexpr

◆ transform_sequences() [2/3]

template<typename F, index_t... Xs, index_t... Ys>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_sequences ( F f,
sequence< Xs... > ,
sequence< Ys... >  )
constexpr

◆ transform_sequences() [3/3]

template<typename F, index_t... Xs, index_t... Ys, index_t... Zs>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_sequences ( F f,
sequence< Xs... > ,
sequence< Ys... > ,
sequence< Zs... >  )
constexpr

◆ transform_tensor_adaptor()

template<typename OldTensorAdaptor, typename NewTransforms, typename NewLowerDimensionOldTopIdss, typename NewUpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tensor_adaptor ( const OldTensorAdaptor & old_tensor_adaptor,
const NewTransforms & new_transforms,
NewLowerDimensionOldTopIdss ,
NewUpperDimensionNewTopIdss  )
constexpr

◆ transform_tensor_descriptor()

template<typename OldTensorDescriptor, typename NewTransforms, typename NewLowerDimensionOldTopIdss, typename NewUpperDimensionNewTopIdss>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tensor_descriptor ( const OldTensorDescriptor & old_tensor_desc,
const NewTransforms & new_transforms,
NewLowerDimensionOldTopIdss ,
NewUpperDimensionNewTopIdss  )
constexpr

◆ transform_tensor_view()

template<typename OldTensorView, typename NewTransforms, typename NewLowerDimensionOldVisibleIdss, typename NewUpperDimensionNewVisibleIdss>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tensor_view ( const OldTensorView & old_tensor_view,
const NewTransforms & new_transforms,
NewLowerDimensionOldVisibleIdss ,
NewUpperDimensionNewVisibleIdss  )
constexpr

◆ transform_tuples() [1/3]

template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tuples ( F f,
const X & x )
constexpr

◆ transform_tuples() [2/3]

template<typename F, typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tuples ( F f,
const X & x,
const Y & y )
constexpr

◆ transform_tuples() [3/3]

template<typename F, typename X, typename Y, typename Z>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::transform_tuples ( F f,
const X & x,
const Y & y,
const Z & z )
constexpr

◆ transpose_host_tensor_descriptor_given_new2old()

template<typename New2Old>
CK_TILE_HOST HostTensorDescriptor ck_tile::transpose_host_tensor_descriptor_given_new2old ( const HostTensorDescriptor & a,
const New2Old & new2old )

◆ transpose_tile2d()

template<typename OutTensor, typename InTensor>
CK_TILE_DEVICE void ck_tile::transpose_tile2d ( OutTensor & out,
const InTensor & in )

◆ tuple_depth() [1/2]

template<index_t depth = 0, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::tuple_depth ( const T & )
constexpr

◆ tuple_depth() [2/2]

template<index_t depth = 0, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::tuple_depth ( const tuple< Ts... > & )
constexpr

◆ tuple_reduce()

template<index_t Idx, index_t End, typename F, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::tuple_reduce ( F && f,
const tuple< Ts... > & t )
constexpr

◆ tuple_reverse()

template<typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::tuple_reverse ( const tuple< Ts... > & t)
constexpr

◆ type_convert() [1/2]

template<typename Y, typename X, std::enable_if_t< std::is_const_v< Y >||std::is_const_v< X >, bool > = false>
CK_TILE_HOST_DEVICE constexpr Y ck_tile::type_convert ( X x)
constexpr

◆ type_convert() [2/2]

template<typename Y, typename X, std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool > = false>
CK_TILE_HOST_DEVICE constexpr Y ck_tile::type_convert ( X x)
constexpr

◆ unpack()

template<typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::unpack ( F && f,
X && x )
constexpr

◆ unpack2()

template<typename F, typename X, typename Y>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::unpack2 ( F && f,
X && x,
Y && y )
constexpr

◆ unroll_nested_tuple() [1/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename T>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::unroll_nested_tuple ( const T & t)
constexpr

◆ unroll_nested_tuple() [2/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::unroll_nested_tuple ( const tuple< Ts... > & t)
constexpr

◆ unroll_nested_tuple() [3/3]

template<index_t Depth = 0, index_t MaxDepth = -1>
CK_TILE_HOST_DEVICE constexpr auto ck_tile::unroll_nested_tuple ( const tuple<> & t)
constexpr

◆ update_tile() [1/2]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::update_tile ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor,
number< i_access > = {},
bool_constant< oob_conditional_check > = {} )

◆ update_tile() [2/2]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename DataType_>
CK_TILE_DEVICE void ck_tile::update_tile ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor )

◆ update_tile_raw() [1/2]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, typename LinearBottomDims_, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::update_tile_raw ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor,
number< i_access > = {},
bool_constant< oob_conditional_check > = {},
bool_constant< pre_nop > = {} )

◆ update_tile_raw() [2/2]

template<typename BottomTensorView_, typename WindowLengths_, typename TileDistribution_, index_t NumCoord, typename DataType_, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::update_tile_raw ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > & dstr_tensor,
number< i_access > = {},
bool_constant< oob_conditional_check > = {},
bool_constant< pre_nop > = {} )

◆ UpdateEnvVar() [1/2]

template<typename EnvVar>
void ck_tile::UpdateEnvVar ( EnvVar ,
const std::string_view & val )

◆ UpdateEnvVar() [2/2]

template<typename EnvVar, typename ValueType>
void ck_tile::UpdateEnvVar ( EnvVar ,
const ValueType & val )

Updates the cached value of an environment variable.

◆ validate_gemm_stride()

void ck_tile::validate_gemm_stride ( std::string a_layout,
std::string b_layout,
std::string c_layout,
int M,
int N,
int K,
int Stride_A,
int Stride_B,
int Stride_C )
inline

◆ validate_stride()

void ck_tile::validate_stride ( std::string Layout,
int M,
int N,
int stride,
const std::string & stride_name )
inline

◆ warp_shuffle()

template<typename T>
CK_TILE_DEVICE T ck_tile::warp_shuffle ( const T & v_local,
uint32_t src_lane )

◆ warp_shuffle_down()

template<typename T>
CK_TILE_DEVICE T ck_tile::warp_shuffle_down ( const T & v_local,
uint32_t lane_delta )

◆ warp_shuffle_down_pair()

template<typename T>
CK_TILE_DEVICE auto ck_tile::warp_shuffle_down_pair ( const T & v_local)

◆ warp_shuffle_up()

template<typename T>
CK_TILE_DEVICE T ck_tile::warp_shuffle_up ( const T & v_local,
uint32_t lane_delta )

◆ welford_update()

template<typename T, bool kFastFDiv = false>
CK_TILE_DEVICE void ck_tile::welford_update ( T & mean,
T & var,
T x,
int count,
bool_constant< kFastFDiv > = {} )

Variable Documentation

◆ ALIBI

uint32_t ck_tile::ALIBI = 8U
constexpr

◆ AllConvertibleToStringView

template<typename... Ts>
bool ck_tile::AllConvertibleToStringView
inline constexpr
Initial value:
=
((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
std::is_same_v<Ts, char>) &&
...)
Definition concat.hpp:12

◆ CUSTOM_MASK

uint32_t ck_tile::CUSTOM_MASK = 1U
constexpr

◆ ERROR_DETAIL_LIMIT

int ck_tile::ERROR_DETAIL_LIMIT = 128
constexpr

Maximum number of error values to display when checking errors.

◆ has_wmma_traits_v

template<typename Arch, typename AType, typename BType, typename CType, index_t warp_m, index_t warp_n, index_t warp_k>
bool ck_tile::has_wmma_traits_v
constexpr
Initial value:
=
static constexpr bool value
Definition warp_gemm_attribute_wmma_impl.hpp:128

◆ ignore

detail::ignore_t ck_tile::ignore
inline constexpr

◆ is_constant_v

template<typename T>
bool ck_tile::is_constant_v = is_constant<T>::value
inline constexpr

◆ is_null_tile_window_v

template<typename T>
bool ck_tile::is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value
constexpr

◆ is_static_v

template<typename T>
bool ck_tile::is_static_v = is_static<T>::value
inline constexpr

◆ is_tile_window_linear_v

template<typename T>
bool ck_tile::is_tile_window_linear_v = is_tile_window_linear<T>::value
inline constexpr

Helper variable template to check if a type is a linear tile window.

Equivalent to is_tile_window_linear<T>::value.

Template Parameters
T: The type to check.

◆ is_tile_window_with_static_distribution_v

template<typename T>
bool ck_tile::is_tile_window_with_static_distribution_v
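inline constexpr
Initial value:
= is_tile_window_with_static_distribution<T>::value
Note: is_tile_window_with_static_distribution is the type trait that determines whether a type is a tile window with static distribution.
Definition tile_window.hpp:1192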

Helper variable template to check if a type is a tile window with static distribution.

Equivalent to is_tile_window_with_static_distribution<T>::value.

Template Parameters
T: The type to check.

◆ is_tile_window_with_static_lengths_v

template<typename T>
bool ck_tile::is_tile_window_with_static_lengths_v
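inline constexpr
Initial value:
= is_tile_window_with_static_lengths<T>::value
Note: is_tile_window_with_static_lengths is the type trait that determines whether a type is a tile window with static lengths.
Definition tile_window.hpp:1235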

Helper variable template to check if a type is a tile window with static lengths.

Equivalent to is_tile_window_with_static_lengths<T>::value.

Template Parameters
T: The type to check.

◆ log2e_rcp_v

template<typename T = double>
T ck_tile::log2e_rcp_v = 1. / log2e<T>::value
constexpr

◆ log2e_v

template<typename T = double>
T ck_tile::log2e_v = log2e<T>::value
constexpr

◆ LOGITS_SOFT_CAP

uint32_t ck_tile::LOGITS_SOFT_CAP = 4U
constexpr

◆ SLIDING_WINDOW

uint32_t ck_tile::SLIDING_WINDOW = 2U
constexpr