20 typename ComputeDataType,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
43 typename ComputeDataType,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
71 ABlockTransferSrcScalarPerVector,
72 BBlockTransferSrcScalarPerVector,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
166 constexpr auto num_ds_read_inst_a =
170 constexpr auto num_ds_read_inst_b =
184 constexpr auto ds_read_a_issue_cycle =
186 constexpr auto ds_read_b_issue_cycle =
188 constexpr auto ds_read_a_mfma_rate =
189 (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
190 constexpr auto ds_read_b_mfma_rate =
191 (mfma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
193 constexpr auto num_dsread_a_mfma =
194 (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
195 constexpr auto num_dsread_b_mfma =
196 (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
204 constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
205 constexpr auto num_mfma_per_issue =
206 num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b);
207 constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a;
208 constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b;
214 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
215 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
217 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
218 __builtin_amdgcn_sched_group_barrier(
219 0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0);
225 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
226 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
228 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
229 __builtin_amdgcn_sched_group_barrier(
230 0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0);
235 if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
238 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
242 __builtin_amdgcn_sched_group_barrier(0x100,
243 num_ds_read_inst_a - (num_dsread_a_mfma - 1) *
247 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
251 if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
254 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0);
258 __builtin_amdgcn_sched_group_barrier(0x100,
259 num_ds_read_inst_b - (num_dsread_b_mfma - 1) *
263 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
267 template <
bool HasMainLoop,
271 typename ABlockTransfer,
272 typename AGridBuffer,
273 typename ABlockBuffer,
274 typename ABlockTransferStep,
277 typename BBlockTransfer,
278 typename BGridBuffer,
279 typename BBlockBuffer,
280 typename BBlockTransferStep,
281 typename CThreadBuffer,
282 typename BScaleGridBuffer,
283 typename BScaleGridDesc,
284 typename BScaleThreadDesc,
285 typename BScaleThreadTransfer,
286 typename BScaleThreadTransferStep>
287 __device__
void Run(
const AGridDesc& a_grid_desc,
288 const ABlockDesc& a_block_desc,
289 ABlockTransfer& a_blockwise_copy,
290 const AGridBuffer& a_grid_buf,
291 ABlockBuffer& a_block_buf,
292 const ABlockTransferStep& a_block_copy_step,
293 const BGridDesc& b_grid_desc,
294 const BBlockDesc& b_block_desc,
295 BBlockTransfer& b_blockwise_copy,
296 const BGridBuffer& b_grid_buf,
297 BBlockBuffer& b_block_buf,
298 const BBlockTransferStep& b_block_copy_step,
299 CThreadBuffer& c_thread_buf,
301 const BScaleGridDesc& b_scale_grid_desc,
302 const BScaleThreadDesc& b_scale_thread_desc,
303 BScaleThreadTransfer& b_scale_thread_copy,
304 const BScaleGridBuffer& b_scale_grid_buf,
305 const BScaleThreadTransferStep& b_scale_thread_copy_step,
308 index_t num_loop_per_scale)
const
310 __builtin_amdgcn_sched_barrier(0);
319 b_scale_thread_desc.GetElementSpaceSize());
322 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
323 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
325 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
326 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
329 b_scale_thread_copy.Run(b_scale_grid_desc,
335 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
336 b_scale_thread_copy_step.At(
Number<0>{}));
339 if(num_loop_per_scale == 1)
341 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
342 b_scale_thread_copy_step.At(
Number<2>{}));
346 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
347 b_scale_thread_copy_step.At(
Number<1>{}));
350 constexpr auto num_scale_k_block = BScaleThreadDesc{}.GetLength(
I1);
351 constexpr auto num_scale_krepeat =
KRepeat / num_scale_k_block;
354 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
355 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
358 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
359 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
361 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
362 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
365 c_thread_buf.Clear();
383 b_scale_thread_buf[
Number<n0 * num_scale_k_block + k0 / num_scale_krepeat>{}],
390 __builtin_amdgcn_sched_barrier(0);
393 if constexpr(HasMainLoop)
400 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
401 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
403 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
404 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
406 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
407 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
410 b_scale_thread_copy.Run(b_scale_grid_desc,
416 b_scale_thread_copy.MoveSrcSliceWindow(
417 b_scale_grid_desc, b_scale_thread_copy_step.At(
Number<0>{}));
420 if((i + 2) % num_loop_per_scale == 0)
422 b_scale_thread_copy.MoveSrcSliceWindow(
423 b_scale_grid_desc, b_scale_thread_copy_step.At(
Number<2>{}));
427 b_scale_thread_copy.MoveSrcSliceWindow(
428 b_scale_grid_desc, b_scale_thread_copy_step.At(
Number<1>{}));
438 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
441 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
446 using mfma_input_type =
454 a_thread_vec.template AsType<mfma_input_type>(),
455 b_thread_vec.template AsType<mfma_input_type>(),
476 b_scale_thread_buf[
Number<n0 * num_scale_k_block +
477 k0 / num_scale_krepeat>{}],
485 __builtin_amdgcn_sched_barrier(0);
488 }
while(i < (num_loop - 1));
500 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
503 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
508 using mfma_input_type =
514 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
515 b_thread_vec.template AsType<mfma_input_type>(),
520 __builtin_amdgcn_sched_barrier(0);
525 using Base::a_thread_copy_;
526 using Base::a_thread_desc_;
527 using Base::b_thread_copy_;
528 using Base::b_thread_desc_;
529 using Base::c_thread_desc_;
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Full
Definition blkgemmpipe_scheduler.hpp:49
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
conditional_t< std::is_same< ComputeDataType, ck::tf32_t >::value, float, ComputeDataType > ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_base.hpp:57
static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > HotLoopInstList
Definition blockwise_gemm_pipeline_xdlops_base.hpp:82
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Write_Inst_Num static constexpr index_t B_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:46
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Read_Inst_Num static constexpr index_t B_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Write_Inst_Num static constexpr index_t A_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:44
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::C_MFMA_Inst_Cycle static constexpr index_t C_MFMA_Inst_Cycle
Definition blkgemmpipe_scheduler.hpp:105
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Read_Width static constexpr index_t B_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:83
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeDataTypeBuf typename Base::ComputeDataTypeBuf ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:145
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:149
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:148
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:151
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:162
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:402
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:366
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:156
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const BScaleGridDesc &b_scale_grid_desc, const BScaleThreadDesc &b_scale_thread_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, const BScaleThreadTransferStep &b_scale_thread_copy_step, index_t num_loop, index_t num_loop_per_scale) const
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:287
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:102
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:372
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:401
ck::BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:147
Definition blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp:37
Definition functional2.hpp:33
Definition dtype_vector.hpp:10