#pragma once #include #include namespace at::native { class QLinearOnednn final { public: C10_API static Tensor run_pointwise_tensor( Tensor act, // int8 CPU tensor, not QTensor Tensor act_scale, Tensor act_zero_point, Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, std::optional bias, double output_scale, int64_t output_zero_point, std::optional output_dtype, std::string_view post_op_name, c10::List> post_op_args, std::string_view post_op_algorithm); C10_API static Tensor run_pointwise_binary_tensor( Tensor act, // int8 CPU tensor, not QTensor Tensor act_scale, Tensor act_zero_point, Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, std::optional other, // extra input for binary post-op std::optional bias, double output_scale, int64_t output_zero_point, std::optional output_dtype, double other_scale, int64_t other_zero_point, std::string_view binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, std::string_view unary_post_op, // e.g. "none", "relu" c10::List> unary_post_op_args, std::string_view unary_post_op_algorithm); }; C10_API Tensor _weight_int4pack_mm_cpu_tensor( const Tensor& A, const Tensor& B, const Tensor& qGroupSize, const Tensor& qScaleAndZeros); } // namespace at::native