gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp Source File
gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
Go to the documentation of this file.
149 // Step 1: Second half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
150 // Step 2: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance), elementwise
Definition ck.hpp:268
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
Definition tensor_description/cluster_descriptor.hpp:13
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__global__ void kernel_reduce_second_half_batchnorm_backward_final(const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:99
Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:110
static constexpr index_t K_BlockTileSize
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:145
decltype(make_naive_tensor_descriptor_packed( make_tuple(Number< MThreadSliceSize >{}, Number< 1 >{}))) ThreadReduceSrcDesc_M_1
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:121
static constexpr index_t M_BlockTileSize
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:144
static constexpr bool reorder_thread_cluster
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:108
static __device__ void Run(const XYGridDesc_M_K &x_grid_desc_m_k, const XYGridDesc_M_K &dy_grid_desc_m_k, const XYGridDesc_M_K &dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K &dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M &mean_var_grid_desc_m, const ScaleBiasGridDesc_M &scale_grid_desc_m, const ScaleBiasGridDesc_M &dscale_dbias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:152
static constexpr auto I1
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:142
tensor_operation::element_wise::PassThrough PassThroughOp
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:139
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadBufferDimAccessOrder
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:112
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:115
PartitionedBlockwiseReduction< AccDataType, BlockSize, ThreadClusterLengths_M_K, ThreadClusterArrangeOrder, ck::reduce::Add, false > BlockwiseReduce
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:126
ThreadwiseReduction< AccDataType, ThreadReduceSrcDesc_M_1, ThreadReduceDstDesc_M, ck::reduce::Add, false > ThreadwiseReduce
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:133
static constexpr auto I0
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:141
static constexpr auto thread_cluster_desc
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:118
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}))) ThreadReduceDstDesc_M
Definition gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp:123
Definition reduction_functions_blockwise.hpp:28
static __device__ void Reduce(BufferType &work_buffer, AccDataType &in_out_value)
Definition reduction_functions_blockwise.hpp:44
Definition utility/sequence.hpp:43
Definition static_buffer.hpp:16
Definition reduction_functions_threadwise.hpp:23
static __device__ void Reduce(const SrcBufferType &src_buf, DstBufferType &dst_buf)
Definition reduction_functions_threadwise.hpp:36
Definition threadwise_tensor_slice_transfer.hpp:39
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition threadwise_tensor_slice_transfer.hpp:66
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition threadwise_tensor_slice_transfer.hpp:234
Definition utility/functional.hpp:100
Definition reduction_operator.hpp:37
Definition functional2.hpp:33
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340