#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/MemoryAccess.cuh>
#include <cstdint>

namespace at::native {

template <int64_t alignment>
inline bool fast_gather_kernel_eligible(
    const TensorIterator& iter,
    char* const out_ptr,
    char* const in_ptr,
    const size_t index_stride_bytes,
    const size_t element_size) {
  using at::native::memory::get_alignment;
  const auto index_element_size = iter.element_size(2);
  // TensorIterator strides and sizes are ordered fastest moving to slowest moving,
  // in contrast to regular sizes.
  // We need contiguous source and destination slices, aligned pointers, and aligned
  // strides and slice size to do vectorized loads.
  // We also need idx to be expanded in the last dimension so we can copy entire slices,
  // and we need the src tensor to keep its 0 stride from restriding
  // (it could have been deleted by dimension collapse; in that case the iterator would
  // still be 2d, but we cannot use the fast path).
  return iter.ndim() == 2 &&
      iter.strides(2)[0] == 0 &&
      iter.strides(2)[1] == index_element_size &&
      static_cast<size_t>(iter.strides(0)[0]) == element_size &&
      static_cast<size_t>(iter.strides(1)[0]) == element_size &&
      iter.strides(1)[1] == 0 &&
      get_alignment(out_ptr) == alignment &&
      get_alignment(in_ptr) == alignment &&
      get_alignment(static_cast<size_t>(iter.shape()[0] * element_size)) == alignment &&
      get_alignment(static_cast<size_t>(index_stride_bytes)) == alignment &&
      get_alignment(static_cast<size_t>(iter.strides(0)[1])) == alignment;
}

template <typename index_t>
void vectorized_gather_kernel_launch(
    char* out,
    char* inp,
    index_t* idx,
    int num_ind,
    int64_t slice_size_in_bytes,
    int64_t ind_dim_size,
    int64_t inp_stride_bytes,
    int64_t out_stride_bytes,
    bool allow_neg_indices = false);

} // namespace at::native
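
// A minimal usage sketch of how a caller might combine the eligibility check with the
// vectorized launch. This is illustrative only: names such as out_ptr, in_ptr, idx_ptr,
// num_ind, and the alignment value 16 are assumptions, not the actual dispatch site.
//
//   constexpr int64_t alignment = 16;  // hypothetical alignment required for vector loads
//   if (fast_gather_kernel_eligible<alignment>(
//           iter, out_ptr, in_ptr, index_stride_bytes, element_size)) {
//     // Entire contiguous slices can be copied with vectorized loads/stores.
//     vectorized_gather_kernel_launch<int64_t>(
//         out_ptr, in_ptr, idx_ptr, num_ind, slice_size_in_bytes, ind_dim_size,
//         inp_stride_bytes, out_stride_bytes);
//   } else {
//     // Fall back to the generic, element-wise gather path.
//   }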