/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): in this copy of the header the targets of the next three
// #include directives, the parameter list after every `template` keyword and
// the template arguments of std::vector / std::unique_ptr are absent. They
// look like they were lost in extraction rather than removed on purpose;
// restore them from the upstream header before compiling.
#include
#include
#include

#include "fbgemm/FbgemmBuild.h"
#include "fbgemm/UtilsAvx2.h"
#include "fbgemm/spmmUtilsAvx2.h"

namespace fbgemm {

/**
 * Container for a sparse matrix in CSR form. It only holds the three arrays
 * rowPtr, colIdx and values; no member functions are declared for it in this
 * header.
 */
template struct FBGEMM_API CSRMatrix {
  std::vector rowPtr;
  std::vector colIdx;
  std::vector values;
};

/**
 * Tiled block CSR format
 * Partial blocks are zero-filled
 *
 * The body refers to the names T, ROW_BLOCK and COL_BLOCK; presumably these
 * are the template parameters of this struct (confirm against the upstream
 * header).
 */
template struct FBGEMM_API BCSRMatrix {
  using DTYPE = T; // Element type taken by pack() and unpack()
  static constexpr int RB = ROW_BLOCK; // Block size for rows
  static constexpr int CB = COL_BLOCK; // Block size for cols
  // We only tile in column dimension currently
  // COLTILE must be a multiple of COL_BLOCK
  // (nothing in this header checks that requirement)
  static constexpr int COLTILE = 4000;
  std::vector rowBPtr; // rowPtr for blocks
  std::vector colBIdx; // colIdx for blocks
  std::vector values;
  // Sum of all elements in a row.
  // The constructor sizes this to R entries, each initialised to 0.
  std::vector row_offsets;
  int R; // Number of rows in the matrix, as passed to the constructor
  int C; // Number of columns in the matrix, as passed to the constructor

  /**
   * @brief store the matrix dimensions and zero-initialise row_offsets
   * @param Rows number of rows in the matrix
   * @param Cols number of columns in the matrix
   *
   * Only R, C and row_offsets are touched here; the other containers are not
   * modified by the constructor body.
   */
  BCSRMatrix(int Rows, int Cols) {
    R = Rows;
    C = Cols;
    row_offsets.resize(R, 0);
  }

  /**
   * @brief pack from dense to tiled block CSR format
   * @param src is the source matrix with data type DTYPE
   * @param ld is the leading dimension
   *
   * The number of rows and columns is not an argument of this function;
   * presumably the R and C members set by the constructor are used (confirm
   * in the definition, which is not part of this header).
   */
  void pack(const DTYPE* src, size_t ld);

  /**
   * @brief pack from dense to tiled block CSR format
   * @param src is the source matrix with data type DTYPE
   *
   * The number of rows and columns is not an argument of this function;
   * presumably the R and C members set by the constructor are used (confirm
   * in the definition, which is not part of this header).
   *
   * leading dim of the matrix is assumed to be equal to C
   */
  void pack(const DTYPE* src);

  /**
   * @brief unpack from tiled block CSR to dense
   * @param dst should be able to hold R*C elements of type DTYPE
   * @param ld is the leading dimension
   */
  void unpack(DTYPE* dst, size_t ld);

  /**
   * @brief unpack from tiled block CSR to dense
   * @param dst should be able to hold R*C elements of type DTYPE
   *
   * leading dimension of the matrix is assumed to be equal to C
   */
  void unpack(DTYPE* dst);
};

// Dense -> CSR conversion helpers. Each returns a std::unique_ptr; the pointee
// type is not visible in this copy of the header.
// R and C are the dimensions of the dense input `inp`, ld its leading
// dimension. The overloads without `ld` presumably assume ld == C, mirroring
// BCSRMatrix::pack (confirm in the definitions).
template FBGEMM_API std::unique_ptr>
fbgemmDenseToCSR(int R, int C, const T* inp, int ld);

template FBGEMM_API std::unique_ptr>
fbgemmDenseToCSR(int R, int C, const T* inp);

// Dense -> block CSR conversion helpers; same argument convention as the
// fbgemmDenseToCSR overloads above.
template FBGEMM_API std::unique_ptr>
fbgemmDenseToBCSR(int R, int C, const T* inp, int ld);

template FBGEMM_API std::unique_ptr>
fbgemmDenseToBCSR(int R, int C, const T* inp);

/**
 * @param accum Controls accumulation.
 *              true (1) means we're accumulating to the C Matrix.
 *
 * Note on matrix order and layout:
 *   Unlike other fbgemm functions that follow PyTorch convention where A
 * matrix is activation (so in uint8_t for quantized FC/Conv or fp32) and B
 * matrix is weight (so in int8_t for quantized FC/Conv or fp32), here A is
 * weight matrix. This is because we mostly target sparsity in weights and for
 * row-major layout it's more efficient to have A as a sparse matrix: for each
 * non-zero of A at ith row and kth column, we can access kth row of B, whose
 * elements are contiguous in memory. If B matrix was sparse, for each non-zero
 * of B at kth row and jth column, we would've needed to access kth column of A,
 * whose elements are not contiguous in memory with C/C++'s row-major layout.
 *   Alternatively, we can call this function as if we're computing
 * C^T = B^T * A^T while maintaining PyTorch's convention that the left-hand
 * side matrix B is activation. If B matrix is in column-major layout, we don't
 * need to do an extra transposition. The C matrix will be output in
 * column-major layout, so if we have back-to-back sparse-dense matrix-matrix
 * multiplications, B matrices of subsequent multiplications will already be in
 * column-major layout. Refer to SparseDenseMMFP32Benchmark.cc for an example.
 *
 * row_ptr, col_idx and values presumably describe the sparse A matrix in the
 * same layout as CSRMatrix (confirm in the definition).
 */
FBGEMM_API void SparseDenseMM(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);

// Sparse-dense multiplication taking the sparse operand as a block CSR matrix
// (bcsr) and the dense operand B as uint8_t. Both an int32_t (C_i32) and a
// uint8_t (C_u8) output pointer are passed together with rParams.
// thread_id / num_threads default to 0 / 1; presumably they select this
// caller's share of the work when several threads cooperate (confirm).
template FBGEMM_API void fbgemmSparseDenseInt8MM(
    int N,
    const std::unique_ptr>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

namespace internal {

// Instruction-set specific entry points. Judging by the names these are the
// AVX2 / AVX-512 implementations behind the public functions above; they are
// not marked FBGEMM_API. The parameter lists mirror the public functions.

// Declared unconditionally, unlike the remaining declarations in this
// namespace.
void SparseDenseMMAvx2(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);

// Everything up to the matching #endif is only declared when FBGEMM_FBCODE is
// defined or the target is not aarch64.
#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
void SparseDenseMMAvx512(
    int M,
    int N,
    const int* row_ptr,
    const int* col_idx,
    const float* values,
    const float* B,
    int ldb,
    float* C,
    int ldc,
    bool accum = false);

template void SparseDenseInt8MMAvx2(
    int N,
    const std::unique_ptr>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

template void SparseDenseInt8MMAvx512(
    int N,
    const std::unique_ptr>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    int ldc,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);

// Unlike the two MM variants above, this one takes neither N nor ldc.
template void SparseDenseInt8MVAvx512(
    const std::unique_ptr>& bcsr,
    const uint8_t* B,
    int ldb,
    int32_t* C_i32,
    uint8_t* C_u8,
    trRequantizationParams_t& rParams,
    bool accum = false,
    int thread_id = 0,
    int num_threads = 1);
#endif // defined(FBGEMM_FBCODE) || !defined(__aarch64__)

} // namespace internal
} // namespace fbgemm