#include "cuda.h" #include #include #include #define PY_SSIZE_T_CLEAN #include typedef struct { PyObject_HEAD; _Alignas(128) CUtensorMap tensorMap; } PyCUtensorMapObject; // Raises a Python exception and returns false if code is not CUDA_SUCCESS. static bool gpuAssert(CUresult code, const char *file, int line) { if (code == CUDA_SUCCESS) return true; const char *prefix = "Triton Error [CUDA]: "; const char *str; cuGetErrorString(code, &str); char err[1024] = {0}; strcat(err, prefix); strcat(err, str); PyGILState_STATE gil_state; gil_state = PyGILState_Ensure(); PyErr_SetString(PyExc_RuntimeError, err); PyGILState_Release(gil_state); return false; } // To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block. #define CUDA_CHECK_AND_RETURN_NULL(ans) \ do { \ if (!gpuAssert((ans), __FILE__, __LINE__)) \ goto cleanup; \ } while (0) // To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block. #define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans) \ do { \ if (!gpuAssert((ans), __FILE__, __LINE__)) { \ PyEval_RestoreThread(_save); \ return NULL; \ } \ } while (0) // Used to check if functions exist in old CUDA driver versions. #define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \ do { \ if ((funcPointer) == NULL) { \ (funcPointer) = (initializerFunction)(); \ if ((funcPointer) == NULL) { \ goto cleanup; \ } \ } \ } while (0) static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { int device_id; if (!PyArg_ParseTuple(args, "i", &device_id)) return NULL; // Get device handle CUdevice device; cuDeviceGet(&device, device_id); // create a struct to hold device properties int max_shared_mem; int max_num_regs; int multiprocessor_count; int warp_size; int sm_clock_rate; int mem_clock_rate; int mem_bus_width; CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device)); CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CUDA_CHECK_AND_RETURN_NULL( cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device)); CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute( &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", max_shared_mem, "max_num_regs", max_num_regs, "multiprocessor_count", multiprocessor_count, "warpSize", warp_size, "sm_clock_rate", sm_clock_rate, "mem_clock_rate", mem_clock_rate, "mem_bus_width", mem_bus_width); cleanup: return NULL; } static PyObject *loadBinary(PyObject *self, PyObject *args) { const char *name; const char *data; Py_ssize_t data_size; int shared; int device; if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) { return NULL; } CUfunction fun; CUmodule mod; int32_t n_regs = 0; int32_t n_spills = 0; int32_t n_max_threads = 0; // create driver handles CUcontext pctx = 0; Py_BEGIN_ALLOW_THREADS; CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx)); if (!pctx) { CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS( cuDevicePrimaryCtxRetain(&pctx, device)); CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx)); } 
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuModuleGetFunction(&fun, mod, name));
  // get allocated registers and spilled registers from the function
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  n_spills /= 4;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
  // set dynamic shared memory if necessary
  int shared_optin;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  if (shared > 49152 && shared_optin > 49152) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
    int shared_total, shared_static;
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        device));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                           shared_optin - shared_static));
  }
  Py_END_ALLOW_THREADS;

  if (PyErr_Occurred()) {
    return NULL;
  }
  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
                       n_spills, n_max_threads);
}

typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
    int *numClusters, CUfunction func, const CUlaunchConfig *config);

typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

#define defineGetFunctionHandle(name, symbolName)                              \
  static symbolName##_t name() {                                               \
    /* Open the shared library */                                              \
    void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY);                       \
    if (!libHandle) {                                                          \
      PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");      \
      return NULL;                                                             \
    }                                                                          \
    /* Clear any existing error */                                             \
    dlerror();                                                                 \
    symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
    /* Check for errors */                                                     \
    const char *err = dlerror();                                               \
    if (err) {                                                                 \
      PyErr_SetString(PyExc_RuntimeError,                                      \
                      "Failed to retrieve " #symbolName " from libcuda.so.1"); \
      dlclose(libHandle);                                                      \
      return NULL;                                                             \
    }                                                                          \
    return funcHandle;                                                         \
  }

defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                        cuOccupancyMaxActiveClusters);
defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
                        cuTensorMapEncodeTiled);

static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
  int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
      maxActiveClusters = -1;
  int shared = 0;
  CUfunction func;
  if (!PyArg_ParseTuple(args, "Kiiii", &func, &shared, &clusterDimX,
                        &clusterDimY, &clusterDimZ)) {
    return NULL;
  }

  // Let each SM have one block
  int maxActiveBlocks = 1;
  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
  Py_END_ALLOW_THREADS;

  CUlaunchAttribute launchAttr[1];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterDimX;
  launchAttr[0].value.clusterDim.y = clusterDimY;
  launchAttr[0].value.clusterDim.z = clusterDimZ;

  CUlaunchConfig config;
  config.gridDimX = clusterDimX;
  config.gridDimY = maxActiveBlocks * clusterDimY;
  config.gridDimZ = clusterDimZ;
  config.blockDimX = 128;
  config.blockDimY = 1;
  config.blockDimZ = 1;
  config.sharedMemBytes = shared;
  config.hStream = 0;
  config.numAttrs = 1;
  config.attrs = launchAttr;

  static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
                                      getCuOccupancyMaxActiveClustersHandle);

  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
  Py_END_ALLOW_THREADS;
  return PyLong_FromLong(maxActiveClusters);

cleanup:
  return NULL;
}

static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
  long size;
  if (!PyArg_ParseTuple(args, "l", &size)) {
    return NULL;
  }
  if (size < 0) {
    PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
    return NULL;
  }

  Py_BEGIN_ALLOW_THREADS;

  // Ensure we have an active context.
  CUcontext ctx = NULL;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
  if (!ctx) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
  }

  // We can't set the fifo size after running a kernel that calls printf.  This
  // is true even if the set() call is a nop and the new size is the same as
  // the old size.
  //
  // This is unfriendly, so check if the old size matches the new size, and
  // skip the set() call if so.
  size_t oldSize = 0;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
  if (oldSize != size) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
  }

  Py_END_ALLOW_THREADS;
  Py_RETURN_NONE;
}

static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
  PyCUtensorMapObject *self = NULL;
  void *mem = NULL;
  size_t size = type->tp_basicsize;
  if (posix_memalign(&mem, 128, size) != 0) {
    PyErr_NoMemory();
    return NULL;
  }
  self = (PyCUtensorMapObject *)mem;
  PyObject_INIT(self, type);
  return (PyObject *)self;
}

static void PyCUtensorMap_dealloc(PyObject *self) {
  Py_TYPE(self)->tp_free(self);
}

static void PyCUtensorMap_free(void *ptr) { free(ptr); }

// clang-format off
static PyTypeObject PyCUtensorMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "triton.backends.nvidia.PyCUtensorMap",
    .tp_basicsize = sizeof(PyCUtensorMapObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_doc = "",
    .tp_new = PyType_GenericNew,
    .tp_alloc = PyCUtensorMap_alloc,
    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
    .tp_free = PyCUtensorMap_free,
};
// clang-format on

static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
  unsigned long long global_address;
  int swizzle;
  int elemSize;
  int elemType;
  PyObject *blockSize;
  PyObject *shape;
  PyObject *strides;
  int padding;
  if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
                        &elemType, &blockSize, &shape, &strides, &padding)) {
    return NULL;
  }

  PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
      (PyObject *)&PyCUtensorMapType, NULL);
  if (!desc) {
    return NULL;
  }

  PyObject *blockSizeFast = NULL;
  PyObject *shapeFast = NULL;
  PyObject *stridesFast = NULL;

  uint32_t blockSizeInt[5];
  uint64_t shapeInt[5];
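  // Tiled TMA descriptors address at most five dimensions, so fixed-size
  // buffers of length 5 suffice here; they are filled innermost-dimension-first
  // (i.e. reversed relative to the Python sequences) below.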
  uint64_t stridesLL[5];

  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
  if (!blockSizeFast)
    goto cleanup;
  int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
  // Reject ranks outside [1, 5] so the fixed-size arrays above cannot be
  // overflowed.
  if (rank < 1 || rank > 5) {
    PyErr_SetString(PyExc_ValueError, "rank must be between 1 and 5");
    goto cleanup;
  }
  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "block size must be an int");
      goto cleanup;
    }
    blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
  }

  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
  if (!shapeFast)
    goto cleanup;
  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "shape must be an int");
      goto cleanup;
    }
    shapeInt[rank - i - 1] = PyLong_AsLong(item);
  }

  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
  if (!stridesFast)
    goto cleanup;
  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  for (int i = 0; i + 1 < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "strides must be an int");
      goto cleanup;
    }
    stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
  }
  stridesLL[rank - 1] =
      shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);

  Py_DECREF(blockSizeFast);
  blockSizeFast = NULL;
  Py_DECREF(shapeFast);
  shapeFast = NULL;
  Py_DECREF(stridesFast);
  stridesFast = NULL;

  CUtensorMapFloatOOBfill fill =
      (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
                     : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;

  uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
                                      getCuTensorMapEncodeTiledHandle);
  CUDA_CHECK_AND_RETURN_NULL(cuTensorMapEncodeTiled(
      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));
  return (PyObject *)desc;

cleanup:
  Py_XDECREF(blockSizeFast);
  Py_XDECREF(shapeFast);
  Py_XDECREF(stridesFast);
  Py_XDECREF(desc);
  return NULL;
}

static PyMethodDef ModuleMethods[] = {
    {"load_binary", loadBinary, METH_VARARGS,
     "Load provided cubin into CUDA driver"},
    {"get_device_properties", getDeviceProperties, METH_VARARGS,
     "Get the properties for a given device"},
    {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
     "Python interface for cuOccupancyMaxActiveClusters function"},
    {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
     "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
     "controls how many bytes can be streamed from kernels before data starts "
     "being dropped. "
     "This inherits all the limitations of this call; in "
     "particular it's an error to change this value after launching any kernel "
     "that calls printf()."},
    {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS,
     "Fill a TMA descriptor (CUtensorMap) describing a tiled view of a tensor "
     "in global memory"},
    {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
                                       NULL, // documentation
                                       -1,   // size
                                       ModuleMethods};

PyMODINIT_FUNC PyInit_cuda_utils(void) {
  if (PyType_Ready(&PyCUtensorMapType) < 0) {
    return NULL;
  }
  PyObject *m = PyModule_Create(&ModuleDef);
  if (m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  Py_INCREF(&PyCUtensorMapType);
  PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);
  return m;
}
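
// Minimal usage sketch (comment only, not part of the module): assuming this
// file is built as the `cuda_utils` extension and that `cubin_bytes`,
// `kernel_name`, `shared_mem_bytes`, and `device_id` are placeholders supplied
// by the caller, the Python side would look roughly like:
//
//   import cuda_utils
//   props = cuda_utils.get_device_properties(0)
//   print(props["max_shared_mem"], props["warpSize"])
//   mod, fun, n_regs, n_spills, n_max_threads = cuda_utils.load_binary(
//       kernel_name, cubin_bytes, shared_mem_bytes, device_id)
//   cuda_utils.set_printf_fifo_size(2 ** 20)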