#include <ATen/AccumulateType.h>
#include <c10/core/Scalar.h>
#include <c10/util/Exception.h>

#include <cmath>
#include <cstdint>
#include <limits>
#include <type_traits>

namespace at::native {

inline void arange_check_bounds(
    const c10::Scalar& start,
    const c10::Scalar& end,
    const c10::Scalar& step) {
  // Use double precision for validation to avoid precision issues.
  double dstart = start.to<double>();
  double dend = end.to<double>();
  double dstep = step.to<double>();

  TORCH_CHECK(dstep > 0 || dstep < 0, "step must be nonzero");
  TORCH_CHECK(
      std::isfinite(dstart) && std::isfinite(dend),
      "unsupported range: ",
      dstart,
      " -> ",
      dend);
  TORCH_CHECK(
      ((dstep > 0) && (dend >= dstart)) || ((dstep < 0) && (dend <= dstart)),
      "upper bound and lower bound inconsistent with step sign");
}

template <typename scalar_t>
int64_t compute_arange_size(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step) {
  arange_check_bounds(start, end, step);

  // We use double precision for (end - start) / step to compute size_d for
  // consistency across devices. The problem with using accscalar_t is that
  // accscalar_t might be float32 on GPU for a float32 scalar_t but double on
  // CPU for the same, so the effective output size would start differing on
  // CPU vs GPU because of precision issues, which we don't want. The corner
  // case we do want to take into account is int64_t, which has higher
  // precision than double.
  double size_d;
  if constexpr (std::is_same_v<scalar_t, int64_t>) {
    using accscalar_t = at::acc_type<scalar_t, false>;
    auto xstart = start.to<accscalar_t>();
    auto xend = end.to<accscalar_t>();
    auto xstep = step.to<accscalar_t>();

    int64_t sgn = (xstep > 0) - (xstep < 0);
    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
  } else {
    size_d = std::ceil(
        static_cast<double>(end.to<double>() - start.to<double>()) /
        step.to<double>());
  }

  TORCH_CHECK(
      size_d >= 0 &&
          size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
      "invalid size, possible overflow?");

  return static_cast<int64_t>(size_d);
}

} // namespace at::native
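
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the header above): a standalone,
// dependency-free version of the two size computations, showing why the
// int64_t branch uses exact integer ceiling division while every other dtype
// goes through double. The names demo_arange_size_int / demo_arange_size_fp
// and the ARANGE_SIZE_DEMO guard are hypothetical and exist only for this
// example; compile it separately, e.g. with -DARANGE_SIZE_DEMO.
// ---------------------------------------------------------------------------
#ifdef ARANGE_SIZE_DEMO

#include <cmath>
#include <cstdint>
#include <iostream>

// Integer path: ceil((end - start) / step) computed exactly in int64_t, so a
// huge integer range is not distorted by double rounding. Assumes the bounds
// were already validated (nonzero step, end on the correct side of start), as
// arange_check_bounds does above.
inline int64_t demo_arange_size_int(int64_t start, int64_t end, int64_t step) {
  int64_t sgn = (step > 0) - (step < 0);
  // With the sign precondition, truncating division here equals the ceiling.
  return (end - start + step - sgn) / step;
}

// Floating-point path: always compute in double so CPU and GPU agree on the
// output size regardless of what accscalar_t would have been on each device.
inline int64_t demo_arange_size_fp(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

int main() {
  std::cout << demo_arange_size_int(0, 10, 3) << '\n';     // 4 -> {0, 3, 6, 9}
  std::cout << demo_arange_size_int(10, 0, -3) << '\n';    // 4 -> {10, 7, 4, 1}
  std::cout << demo_arange_size_fp(0.0, 1.0, 0.3) << '\n'; // 4 -> {0, 0.3, 0.6, 0.9}
  return 0;
}

#endif // ARANGE_SIZE_DEMO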