import functools
import operator
import os
import re
import subprocess
import triton
from pathlib import Path

from triton.runtime import knobs
from triton.runtime.build import compile_module_from_src
from triton.runtime import _allocation
from triton.backends.compiler import GPUTarget
from triton.backends.driver import GPUDriver

dirname = os.path.dirname(os.path.realpath(__file__))
include_dirs = [os.path.join(dirname, "include")]
libdevice_dir = os.path.join(dirname, "lib")
libraries = ['cuda']


@functools.lru_cache()
def libcuda_dirs():
    env_libcuda_path = knobs.nvidia.libcuda_path
    if env_libcuda_path:
        return [env_libcuda_path]

    libs = subprocess.check_output(["/sbin/ldconfig", "-p"], errors="ignore").decode()
    # each line looks like the following:
    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
    locs = [line.split()[-1] for line in libs.splitlines() if "libcuda.so.1" in line]
    dirs = [os.path.dirname(loc) for loc in locs]
    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
    if env_ld_library_path and not dirs:
        dirs = [dir for dir in env_ld_library_path.split(":")
                if os.path.exists(os.path.join(dir, "libcuda.so.1"))]
    msg = 'libcuda.so cannot found!\n'
    if locs:
        msg += 'Possible files are located at %s.' % str(locs)
        msg += 'Please create a symlink of libcuda.so to any of the files.'
    else:
        msg += 'Please make sure GPU is set up and then run "/sbin/ldconfig"'
        msg += ' (requires sudo) to refresh the linker cache.'
    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
    return dirs


@functools.lru_cache()
def library_dirs():
    return [libdevice_dir, *libcuda_dirs()]


class CudaUtils(object):

    def __new__(cls):
        if not hasattr(cls, "instance"):
            cls.instance = super(CudaUtils, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        mod = compile_module_from_src(
            src=Path(os.path.join(dirname, "driver.c")).read_text(),
            name="cuda_utils",
            library_dirs=library_dirs(),
            include_dirs=include_dirs,
            libraries=libraries,
        )
        # Expose the CUtensorMap wrapper type at module level so the generated
        # launcher can look it up by name ("PyCUtensorMap").
        global PyCUtensorMap
        PyCUtensorMap = mod.PyCUtensorMap
        self.load_binary = mod.load_binary
        self.get_device_properties = mod.get_device_properties
        self.cuOccupancyMaxActiveClusters = mod.cuOccupancyMaxActiveClusters
        self.set_printf_fifo_size = mod.set_printf_fifo_size
        self.fill_tma_descriptor = mod.fill_tma_descriptor


def ty_to_cpp(ty):
    if ty[0] == '*':
        return "CUdeviceptr"
    if ty.startswith("tensordesc"):
        return "CUtensorMap"
    return {
        "i1": "int8_t",
        "i8": "int8_t",
        "i16": "int16_t",
        "i32": "int32_t",
        "i64": "int64_t",
        "u1": "uint8_t",
        "u8": "uint8_t",
        "u16": "uint16_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        "fp16": "double",
        "bf16": "double",
        "fp32": "double",
        "f32": "double",
        "fp64": "double",
        "nvTmaDesc": "CUtensorMap",
    }[ty]


# Scalar floats cross the Python/C boundary as doubles and are re-packed into
# the bit width the kernel actually expects before launch.
FLOAT_STORAGE_TYPE = {
    "fp16": "uint16_t",
    "bf16": "uint16_t",
    "fp32": "uint32_t",
    "f32": "uint32_t",
    "fp64": "uint64_t",
}
FLOAT_PACK_FUNCTION = {
    "fp16": "pack_fp16",
    "bf16": "pack_bf16",
    "fp32": "pack_fp32",
    "f32": "pack_fp32",
    "fp64": "pack_fp64",
}

# Fixed leading arguments of the generated launch() entry point: grid sizes,
# stream, function handle, two launch flags and six PyObject* arguments.
_BASE_ARGS_FORMAT = "iiiKKppOOOOOO"
_BASE_ARGS_FORMAT_LEN = len(_BASE_ARGS_FORMAT)
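
# Illustrative sketch (not executed anywhere in this module): how a simple
# Triton signature maps onto the C-side argument handling defined above,
# assuming a kernel taking a float32 pointer, an int32 and an fp16 scalar:
#
#     sig = {0: "*fp32", 1: "i32", 2: "fp16"}
#     [ty_to_cpp(ty) for ty in sig.values()]
#     # -> ["CUdeviceptr", "int32_t", "double"]
#     FLOAT_STORAGE_TYPE["fp16"], FLOAT_PACK_FUNCTION["fp16"]
#     # -> ("uint16_t", "pack_fp16")
#
# i.e. the fp16 scalar arrives as a Python float (C double) and is narrowed to
# a uint16_t via pack_fp16 inside the generated launcher.
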
def make_launcher(constants, signature, tensordesc_meta):

    def _expand_signature(signature):
        output = []
        # Expand tensor descriptor arguments into the values the launcher
        # actually forwards: either a host-built CUtensorMap (nvTmaDesc) or
        # the raw base pointer, followed by the global shape and strides.
        tensordesc_idx = 0
        for sig in signature:
            if isinstance(sig, str) and sig.startswith("tensordesc"):
                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
                tensordesc_idx += 1

                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
                dtype = match.group(1)
                shape = match.group(2)
                ndim = shape.count(",") + 1

                if meta is not None:
                    output.append("nvTmaDesc")
                else:
                    output.append("*" + dtype)
                for _ in range(ndim):
                    output.append("i32")
                for _ in range(ndim):
                    output.append("i64")
            else:
                output.append(sig)

        assert tensordesc_idx == len(tensordesc_meta or [])
        return output

    def _flatten_signature(sig, output):
        # Tuple arguments arrive nested; flatten them into a single list.
        if isinstance(sig, tuple):
            for x in sig:
                _flatten_signature(x, output)
        else:
            output.append(sig)

    def _extracted_type(ty):
        if isinstance(ty, tuple):
            val = ','.join(map(_extracted_type, ty))
            return f"[{val}]"
        if ty[0] == '*':
            return "PyObject*"
        if ty in ("constexpr", "nvTmaDesc"):
            return "PyObject*"
        return ty_to_cpp(ty)

    def format_of(ty):
        if isinstance(ty, tuple):
            val = ''.join(map(format_of, ty))
            return f"({val})"
        if ty[0] == '*':
            return "O"
        if ty in ("constexpr", "nvTmaDesc"):
            return "O"
        if ty.startswith("tensordesc"):
            return "O"
        return {
            "double": "d",
            "long": "l",
            "int8_t": "b",
            "int16_t": "h",
            "int32_t": "i",
            "int64_t": "L",
            "uint8_t": "B",
            "uint16_t": "H",
            "uint32_t": "I",
            "uint64_t": "K",
        }[ty_to_cpp(ty)]

    signature = {i: ty for i, ty in enumerate(_expand_signature(signature.values()))}

    args_format = ''.join([format_of(ty) for ty in signature.values()])
    format = _BASE_ARGS_FORMAT + args_format

    flat_signature = []
    for ty in signature.values():
        _flatten_signature(ty, flat_signature)
    signature = {i: ty for i, ty in enumerate(flat_signature)}

    args_list = ', ' + ', '.join(f"&_arg{i}" for i in signature.keys()) if len(signature) > 0 else ''

    arg_decl_list = []
    for i, ty in signature.items():
        if ty == "constexpr":
            continue
        if ty in FLOAT_STORAGE_TYPE:
            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
        else:
            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
    arg_decls = ', '.join(arg_decl_list)

    internal_args_list = []
    for i, ty in signature.items():
        if ty[0] == "*":
            internal_args_list.append(f"ptr_info{i}.dev_ptr")
        elif ty in FLOAT_STORAGE_TYPE:
            internal_args_list.append(f"_arg{i}_storage")
        elif ty == "nvTmaDesc":
            internal_args_list.append(f"*tma_ptr{i}")
        elif ty != "constexpr":
            internal_args_list.append(f"_arg{i}")

    # Per-argument glue emitted into launch() below.
    newline = '\n  '
    ptr_decls = [
        f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;"
        for i, ty in signature.items()
        if ty[0] == "*"
    ]
    tma_decls = [
        f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;"
        for i, ty in signature.items()
        if ty == "nvTmaDesc"
    ]
    float_storage_decls = [
        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
        for i, ty in signature.items()
        if ty in FLOAT_STORAGE_TYPE
    ]

    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
    params.append("&global_scratch")
    params.append("&profile_scratch")

    src = f"""
#include "cuda.h"
#include <stdbool.h>
#include <stdint.h>
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {{
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
}} PyCUtensorMapObject;

static inline void gpuAssert(CUresult code, const char *file, int line) {{
  if (code != CUDA_SUCCESS) {{
    const char* prefix = "Triton Error [CUDA]: ";
    const char* str;
    cuGetErrorString(code, &str);
    char err[1024] = {{0}};
    strcat(err, prefix);
    strcat(err, str);
    PyGILState_STATE gil_state;
    gil_state = PyGILState_Ensure();
    PyErr_SetString(PyExc_RuntimeError, err);
    PyGILState_Release(gil_state);
  }}
}}

#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }}
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
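
// _launch below drives cuLaunchKernelEx directly. Depending on the flags
// passed in from Python it attaches up to four CUlaunchAttribute entries:
// programmatic stream serialization (PDL), cooperative launch, the CTA
// cluster dimensions and a cluster scheduling policy preference.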
static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
  void *params[] = {{ {', '.join(params)} }};
  if (gridX*gridY*gridZ > 0) {{
    // 4 attributes that we can currently pass maximum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {{
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }}
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;
    if (num_ctas != 1) {{
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }}
    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;
    if (launch_pdl != 0) {{
      CUlaunchAttribute pdlAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1 }};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }}
    if (launch_cooperative_grid != 0) {{
      CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1 }};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }}
    if (num_ctas != 1) {{
      CUlaunchAttribute clusterAttr = {{}};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {{}};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }}
    config.numAttrs = num_attrs;
    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }}
}}

typedef struct _DevicePtrInfo {{
  CUdeviceptr dev_ptr;
  bool valid;
}} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;
static PyObject* py_tensor_map_type = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {{
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }}
  if (obj == Py_None) {{
    // valid nullptr
    return ptr_info;
  }}
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {{
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }}
  if (!PyLong_Check(ret)) {{
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }}
  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
  if (!ptr_info.dev_ptr)
    return ptr_info;
  uint64_t dev_ptr;
  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) {{
    PyErr_Format(PyExc_ValueError,
                 "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
    ptr_info.valid = false;
  }} else if (status != CUDA_SUCCESS) {{
    CUDA_CHECK(status);  // Catch any other cuda API errors
    ptr_info.valid = false;
  }}
  ptr_info.dev_ptr = dev_ptr;
cleanup:
  Py_XDECREF(ret);
  return ptr_info;
}}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {{
  if (sizeof(CUtensorMap*) != 8) {{
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }}
  if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {{
    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
    return NULL;
  }}
  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
  if (align_128 != 0) {{
    PyErr_Format(PyExc_ValueError,
                 "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
    return NULL;
  }}
  return map;
}}

static void ensureCudaContext() {{
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {{
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }}
}}

static uint16_t pack_fp16(double f) {{
  uint16_t result;
  // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
  _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
  PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
  return result;
}}

static uint16_t pack_bf16(double f) {{
  float f32 = (float)f;
  uint32_t u32 = *(uint32_t*)&f32;
  return (uint16_t)(u32 >> 16);
}}

static uint32_t pack_fp32(double f) {{
  float f32 = (float)f;
  return *(uint32_t*)&f32;
}}

static uint64_t pack_fp64(double f) {{
  return *(uint64_t*)&f;
}}
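
// Scalar floating point kernel arguments arrive from Python as C doubles (the
// "d" format used by PyArg_ParseTuple). The pack_* helpers above narrow them
// to the bit pattern of the kernel's declared precision before they are put
// into the params array handed to _launch.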
static PyObject* launch(PyObject* self, PyObject* args) {{
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  PyObject *profile_scratch_obj = NULL;
  {' '.join([f'{_extracted_type(ty)} _arg{i};' for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, "{format}", &gridX, &gridY, &gridZ, &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj, &kernel_metadata, &launch_metadata, &launch_enter_hook, &launch_exit_hook{args_list})) {{
    return NULL;
  }}

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }}

  // extract launch metadata
  if (launch_enter_hook != Py_None){{
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }}

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {{
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {{
      return NULL;
    }}
    global_scratch = global_scratch_info.dev_ptr;
  }}

  CUdeviceptr profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {{
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {{
      return NULL;
    }}
    profile_scratch = profile_scratch_info.dev_ptr;
  }}

  // raise exception asap
  {newline.join(ptr_decls)}
  {newline.join(tma_decls)}
  {newline.join(float_storage_decls)}
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {{
    return NULL;
  }}

  if(launch_exit_hook != Py_None){{
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }}

  Py_RETURN_NONE;
}}

static PyMethodDef ModuleMethods[] = {{
  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
  {{NULL, NULL, 0, NULL}} // sentinel
}};

static struct PyModuleDef ModuleDef = {{
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
}};

PyMODINIT_FUNC PyInit___triton_launcher(void) {{
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {{
    return NULL;
  }}
  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
  if (driver_mod == NULL) {{
    return NULL;
  }}
  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
  if (py_tensor_map_type == NULL) {{
    return NULL;
  }}
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {{
    return NULL;
  }}
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}}
"""
    return src


def make_tensordesc_arg(arg, metadata):
    if metadata is None:
        # Device-side tensor descriptor: pass the base pointer plus the global
        # shape and strides and let the kernel build the descriptor itself.
        return [arg.base, *arg.shape, *arg.strides]

    swizzle = metadata["swizzle"]
    elem_size = metadata["elem_size"]
    elem_type = metadata["elem_type"]
    block_size = metadata["block_size"]
    fp4_padded = metadata["fp4_padded"]

    shape = arg.shape
    strides = arg.strides
    assert strides[-1] == 1
    padding = 1 if arg.padding == "nan" else 0

    if fp4_padded:
        shape = list(shape)
        shape[-1] *= 2

    cu_tensor_map = triton.runtime.driver.active.utils.fill_tma_descriptor(
        arg.base.data_ptr(),
        swizzle,
        elem_size,
        list(block_size),
        elem_type,
        padding,
        shape,
        strides,
    )
    return [cu_tensor_map, *shape, *strides]


def wrap_handle_tensordesc(launcher, signature, tensordesc_meta):
    has_tensor_desc_arg = any(
        isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
    if not has_tensor_desc_arg:
        return launcher

    tensordesc_indices = set(
        i for i, sig in enumerate(signature.values())
        if isinstance(sig, str) and sig.startswith("tensordesc"))
    if tensordesc_meta:
        assert len(tensordesc_indices) == len(tensordesc_meta)
    else:
        tensordesc_meta = [None] * len(tensordesc_indices)

    def inner(*args):
        final_args = list(args[:_BASE_ARGS_FORMAT_LEN])
        meta_idx = 0
        for i, arg in enumerate(args[_BASE_ARGS_FORMAT_LEN:]):
            if i in tensordesc_indices:
                final_args.extend(make_tensordesc_arg(arg, tensordesc_meta[meta_idx]))
                meta_idx += 1
            else:
                final_args.append(arg)
        return launcher(*final_args)

    return inner


class CudaLauncher(object):

    def __init__(self, src, metadata):
        constants = src.constants if hasattr(src, "constants") else dict()

        def cst_key(i):
            return src.fn.arg_names.index(i) if isinstance(i, str) else i

        constants = {cst_key(key): value for key, value in constants.items()}
        signature = {cst_key(key): value for key, value in src.signature.items()}
        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
        src = make_launcher(constants, signature, tensordesc_meta)
        mod = compile_module_from_src(
            src=src,
            name="__triton_launcher",
            library_dirs=library_dirs(),
            include_dirs=include_dirs,
            libraries=libraries,
        )
        self.num_ctas = functools.reduce(operator.mul, metadata.cluster_dims, 1)
        self.launch = wrap_handle_tensordesc(mod.launch, signature, tensordesc_meta)
        self.global_scratch_size = metadata.global_scratch_size
        self.global_scratch_align = metadata.global_scratch_align
        self.profile_scratch_size = metadata.profile_scratch_size
        self.profile_scratch_align = metadata.profile_scratch_align
        self.launch_cooperative_grid = metadata.launch_cooperative_grid
        self.launch_pdl = metadata.launch_pdl

    def __call__(self, gridX, gridY, gridZ, stream, function, *args):

        def allocate_scratch(size, align, allocator):
            if size > 0:
                grid_size = gridX * gridY * gridZ
                alloc_size = grid_size * self.num_ctas * size
                alloc_fn = allocator.get()
                return alloc_fn(alloc_size, align, stream)
            return None

        global_scratch = allocate_scratch(self.global_scratch_size, self.global_scratch_align,
                                          _allocation._allocator)
        profile_scratch = allocate_scratch(self.profile_scratch_size, self.profile_scratch_align,
                                           _allocation._profile_allocator)
        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
                    global_scratch, profile_scratch, *args)


class CudaDriver(GPUDriver):

    def __init__(self):
        self.utils = CudaUtils()
        self.launcher_cls = CudaLauncher
        super().__init__()

    def get_current_target(self):
        device = self.get_current_device()
        capability = self.get_device_capability(device)
        capability = capability[0] * 10 + capability[1]
        warp_size = 32
        return GPUTarget("cuda", capability, warp_size)

    def get_active_torch_device(self):
        import torch
        return torch.device("cuda", self.get_current_device())

    def get_device_interface(self):
        import torch
        return torch.cuda

    @staticmethod
    def is_active():
        try:
            import torch
            return torch.cuda.is_available() and (torch.version.hip is None)
        except ImportError:
            return False

    def map_python_to_cpp_type(self, ty: str) -> str:
        return ty_to_cpp(ty)

    def get_benchmarker(self):
        from triton.testing import do_bench
        return do_bench

    def get_empty_cache_for_benchmark(self):
        import torch
        # Keep a 256 MB buffer that is cleared before each benchmarked kernel
        # call so the L2 cache does not contain any input data before the run.
        cache_size = 256 * 1024 * 1024
        return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')

    def clear_cache(self, cache):
        cache.zero_()