K iGiddlZddlZddlZddlZddlmZddlmZddlm Z ddl m Z ddl m Z ddlmZddlmZej$j'ej$j)eZej$j-ed gZd Zej2d ZGd d eZdZddddddZddddddZdZ dZ!dZ"GddeZ#Gdde Z$y)N)Path)knobs) GPUTarget) GPUDriver) _allocation)compile_module_from_src)TensorDescriptorincludec ddl}|jdk7ryddl ddlm}m}m}m m m}G fdd j} j|||||||} jdj}| g|_||_d j!dz} fd } ||| | r$t#j$ j'| Sy#t$rYywxYw) NrLinux)c_charc_intc_size_tc_void_pc_char_pPOINTERc"eZdZdWfdWfgZy)8_find_already_mmapped_dylib_on_linux..DlPhdrInfo dlpi_addr dlpi_nameN)__name__ __module__ __qualname___fields_)rrs`/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/triton/backends/amd/driver.py DlPhdrInfors ( # ( # rz libc.so.6ic |jj}ttj|}|j vr'j ||tt|yy)Nrr) contentsrrosfsdecodenamememmoveminlen)infosizedatarpctypeslib_namemax_path_lengths rcallbackz6_find_already_mmapped_dylib_on_linux..callback3sRMM++ Y' ( qvv  NN4CY,P Qr)platformsystemr+r rrrrr Structure CFUNCTYPECDLLdl_iterate_phdr Exceptionargtypesrestypecreate_string_bufferr!r" string_at)r,r/r rrrr callback_tr4pathr.rrr+r-s` @@@@r$_find_already_mmapped_dylib_on_linuxr<sG# KK V%% !!%)OQXY_Q`aJ ++k2BB!+H5O#OO  & &': ;Dz(+T2{{6++D122 + s1C-- C98C9c d}tjjx}rC|j|r!tj j |r|Std|d|t|}|r2tj j |r|Std|d|g}tj jtj jtd|}tj j |r|S|j|ddl }|j}|j}|j r|g|z}|D]X}tj j|dd|}tj j |r|cS|j|Zt j"d } | rj| j%d D]V} tj j| |} tj j | r| cS|j| Xt j"d } | rStj j| d|} tj j | r| S|j|  t'j(d d gj+j-}|rStj j|d|} tj j | r| S|j| t j"d}|rStj j|d|}tj j |r|S|j|t'j(ddgj+d}|j3Dcgc]5}|j-j|s#|j%d7}}|D]6}tj j |r|cS|j|8tj jd|}tj j |r|S|j|td|d|#t&j.t0f$rYwxYwcc}w)Nzlibamdhip64.sozTRITON_LIBHIP_PATH 'z' does not point to a valid zmemory mapped 'z'' in process does not point to a valid librtorchLD_LIBRARY_PATH:HIP_PATH hipconfigz--path ROCM_PATHz/sbin/ldconfigz-pignore)errorsz/opt/rocm/lib/zcannot locate z after attempted paths )ramd libhip_pathendswithr!r;exists RuntimeErrorr<joindirname__file__appendsitegetsitepackagesgetusersitepackagesENABLE_USER_SITEgetenvsplit subprocess check_outputdecodestripCalledProcessErrorFileNotFoundError splitlines)r,env_libhip_path mmapped_pathpaths local_librQ site_packages user_siter;env_ld_library_pathdf env_hip_path hip_lib_pathhip_root env_rocm_path rocm_lib_pathlibslinelocsloccommon_install_paths r_get_path_to_hip_runtime_dylibrqAsH ))////  # #H -"''..2Q" "1/1BB^_g^hijj8AL 77>>, ' _\N:abjaklmm E RWW__X6xHI ww~~i  LL((*M((*I " m3 ww||D'5(; 77>>$ K T ))$56$**3/ A Q)Aww~~a  LLO  99Z(Lww||L%B 77>>, '  \"  **K+BCJJLRRT 77<<%BLww~~l+## LL & IIk*M ]E8D 77>>- (  ]#  " "$4d#; < C C8 C TD*.): ^djjl>S>ST\>]DJJL  ^D ^ 77>># J S '',,'7B ww~~)*"" LL$% z1HP QQ;  ) )+< =   " _s%A6R R$R;&R;R87R8c$eZdZfdZdZxZS)HIPUtilscdt|dstt|||_|jS)Ninstance)hasattrsuperrs__new__ru)cls __class__s rrxzHIPUtils.__new__s*sJ' 37%>"r)rrrrxr __classcell__rzs@rrsrss  ?rrsc>|ddk(rydddddddd d d d d d d d d |S)Nr*hipDeviceptr_tint8_tint16_tint32_tint64_tuint8_tuint16_tuint32_tuint64_tdouble)i1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64)tys r ty_to_cpprsQ !u|   !  rrrr)rrrrr pack_fp16 pack_bf16 pack_fp32 pack_fp64 piiiKKOOOOOc"d}fdfdfdt||jDcic]\}}|| }}}dj|jDcgc] }| c}}t|z}djt |j}t t t|jd}t|D cic]\} }| | }} }t|dkDr)ddjd |jDznd} g} |jD]P\} }|d k(r |tvr| jt|d | 2| jt|d | Rdj| } g} |jD][\} }|dd k(r| jd | d$|tvr| jd| dB|d k7sH| jd| ]|jD cgc])\} }|tvrt|d| dt|d| d+}} }t}t t!t|}|jD cgc]\} }|d k7s d| }} }|jd|jdd|dt| dkDrd| zndddj|d|d|ddj|jD cgc]\} }|d| dc}} d |d!| d"dj|d#dj|jD cgc]\} }|dd k(rd$| d%| d| d&| d' nd c}} d(t| dkDrddj| zndd)}|Scc}}wcc}wcc}} wcc}} wcc}} wcc}} wcc}} w)*Ncg}|D]}t|tr|jdr|jddz}t j d|j }|jd|ztd|zD]}|jd|jdt|D]}|jd t|D]}|jd|j||S) N tensordesc,rztensordesc<([^[>]*)rrrr) isinstancestr startswithcountrematchgrouprPrange) signatureoutputsigndimdtype_s r_expand_signaturez(make_launcher.._expand_signatures #C#s#|(Dyy~)!6<BBD cEk*q4x)AMM%() d# t)AMM%()t)AMM%() c"' #* rc^t|trdjt|S|S)Nr)rtuplerMmap)r_serialize_signatures rrz+make_launcher.._serialize_signatures) c5 !88C 4c:; ; rct|tr!djt|}d|dS|ddk(ry|dk(ryt |S)Nr[]rrz PyObject* constexprrrrMrr)rval_extracted_types rrz&make_launcher.._extracted_typesS b% ((334Cse1:  a5C<  }rc t|tr!djt|}d|dS|ddk(ry|dk(rydd d d d d ddddd t |S)N()rrOrrelbhiLBHIK) rlongrrrrrrrrr)rr format_ofs rrz make_launcher..format_of s b% ''#i,-Cse1:  a5C<     B-  rrrrz, c3,K|] \}}d|yw)z&_argNr).0rrs r z make_launcher..%s LB5 Lsrz argrptr_infoz.dev_ptr_arg_storagez _argz _storage = z(_argz);z&argz&global_scratchz&profile_scratcha\ #define __HIP_PLATFORM_AMD__ #include #include #include #include #include #include // The list of paths to search for the HIP runtime library. The caller Python // code should substitute the search path placeholder. static const char *hipLibSearchPaths[] = {"a"}; // The list of HIP dynamic library symbols and their signature we are interested // in this file. #define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN) \ FOR_EACH_STR_FN(hipGetLastError) \ FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError) \ FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f, \ unsigned int gridDimX, unsigned int gridDimY, \ unsigned int gridDimZ, unsigned int blockDimX, \ unsigned int blockDimY, unsigned int blockDimZ, \ unsigned int sharedMemBytes, hipStream_t stream, \ void **kernelParams, void **extra) \ FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, hipFunction_t f, \ unsigned int gridDimX, unsigned int gridDimY, \ unsigned int gridDimZ, unsigned int blockDimX, \ unsigned int blockDimY, unsigned int blockDimZ, \ unsigned int sharedMemBytes, hipStream_t stream, \ void **kernelParams, void **extra) \ FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data, \ hipPointer_attribute attribute, hipDeviceptr_t ptr) // The HIP symbol table for holding resolved dynamic library symbols. struct HIPSymbolTable { #define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...) \ hipError_t (*hipSymbolName)(__VA_ARGS__); #define DEFINE_EACH_STR_FIELD(hipSymbolName, ...) \ const char *(*hipSymbolName)(__VA_ARGS__); HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD) }; static struct HIPSymbolTable hipSymbolTable; bool initSymbolTable() { // Use the HIP runtime library loaded into the existing process if it exits. void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD); // Otherwise, go through the list of search paths to dlopen the first HIP // driver library. if (!lib) { int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]); for (int i = 0; i < n; ++i) { void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL); if (handle) { lib = handle; } } } if (!lib) { PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so"); return false; } typedef hipError_t (*hipGetProcAddress_fn)( const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags, hipDriverProcAddressQueryResult *symbolStatus); hipGetProcAddress_fn hipGetProcAddress; dlerror(); // Clear existing errors const char *error = NULL; *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress"); error = dlerror(); if (error) { PyErr_SetString(PyExc_RuntimeError, "cannot query 'hipGetProcAddress' from libamdhip64.so"); dlclose(lib); return false; } // Resolve all symbols we are interested in. int hipVersion = HIP_VERSION; uint64_t hipFlags = 0; hipDriverProcAddressQueryResult symbolStatus; hipError_t status = hipSuccess; #define QUERY_EACH_FN(hipSymbolName, ...) status = hipGetProcAddress(#hipSymbolName, (void **)&hipSymbolTable.hipSymbolName, hipVersion, hipFlags, &symbolStatus); if (status != hipSuccess) { PyErr_SetString(PyExc_RuntimeError, "cannot get address for '" #hipSymbolName "' from libamdhip64.so"); dlclose(lib); return false; } HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN) return true; } static inline void gpuAssert(hipError_t code, const char *file, int line) { if (code != HIP_SUCCESS) { const char* prefix = "Triton Error [HIP]: "; const char* str = hipSymbolTable.hipGetErrorString(code); char err[1024] = {0}; snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str ); PyErr_SetString(PyExc_RuntimeError, err); } } #define HIP_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function, hipDeviceptr_t profile_scratchz>) { hipDeviceptr_t global_scratch = 0; void *params[] = { z }; if (gridX*gridY*gridZ > 0 && launch_cooperative_grid) { HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, z*num_warps, 1, 1, shared_memory, stream, params, 0)); return; } if (gridX*gridY*gridZ > 0) { HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, ae *num_warps, 1, 1, shared_memory, stream, params, 0)); } } typedef struct _DevicePtrInfo { hipDeviceptr_t dev_ptr; bool valid; } DevicePtrInfo; static PyObject* data_ptr_str = NULL; static inline DevicePtrInfo getPointer(PyObject *obj, int idx) { DevicePtrInfo ptr_info; hipError_t status = hipSuccess; ptr_info.dev_ptr = 0; ptr_info.valid = true; if (PyLong_Check(obj)) { ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj); return ptr_info; } if (obj == Py_None) { // valid nullptr return ptr_info; } PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str); if (!ret) { PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); ptr_info.valid = false; goto cleanup; } if (!PyLong_Check(ret)) { PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); ptr_info.valid = false; goto cleanup; } ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret); if (!ptr_info.dev_ptr) goto cleanup; uint64_t dev_ptr; status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); if (status == hipErrorInvalidValue) { PyErr_Format(PyExc_ValueError, "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); ptr_info.valid = false; // Clear and ignore HIP error (void)hipSymbolTable.hipGetLastError(); } ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr; cleanup: Py_DECREF(ret); return ptr_info; } static uint16_t pack_fp16(double f) { uint16_t result; // from https://github.com/python/pythoncapi-compat/blob/5e317108f872c904eb726cb8d560dcadbdf88a72/pythoncapi_compat.h#L482-L492 #if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION) _PyFloat_Pack2(f, (unsigned char*)&result, 1); #else PyFloat_Pack2(f, (char*)&result, 1); #endif return result; } static uint16_t pack_bf16(double f) { float f32 = (float)f; uint32_t u32 = *(uint32_t*)&f32; return (uint16_t)(u32 >> 16); } static uint32_t pack_fp32(double f) { float f32 = (float)f; return *(uint32_t*)&f32; } static uint64_t pack_fp64(double f) { return *(uint64_t*)&f; } static PyObject* launch(PyObject* self, PyObject* args) { int gridX, gridY, gridZ; uint64_t _stream; uint64_t _function; int launch_cooperative_grid; PyObject *profile_scratch_obj = NULL; PyObject *launch_enter_hook = NULL; PyObject *launch_exit_hook = NULL; PyObject *kernel_metadata = NULL; PyObject *launch_metadata = NULL;  z; z if(!PyArg_ParseTuple(args, "a,", &launch_cooperative_grid, &gridX, &gridY, &gridZ, &_stream, &_function, &profile_scratch_obj, &kernel_metadata, &launch_metadata, &launch_enter_hook, &launch_exit_hook z)) { return NULL; } a // extract kernel metadata int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ; if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) { return NULL; } // extract launch metadata if (launch_enter_hook != Py_None){ PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata); if (!ret) return NULL; Py_DECREF(ret); } hipDeviceptr_t profile_scratch = 0; if (profile_scratch_obj != Py_None) { DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1); if (!profile_scratch_info.valid) { return NULL; } profile_scratch = profile_scratch_info.dev_ptr; } // raise exception asap zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;z; _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, (hipDeviceptr_t)profile_scratcha); if(launch_exit_hook != Py_None){ PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata); if (!ret) return NULL; Py_DECREF(ret); } if(PyErr_Occurred()) { return NULL; } Py_RETURN_NONE; } static PyMethodDef ModuleMethods[] = { {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}, {NULL, NULL, 0, NULL} // sentinel }; static struct PyModuleDef ModuleDef = { PyModuleDef_HEAD_INIT, "__triton_launcher", NULL, //documentation -1, //size ModuleMethods }; PyMODINIT_FUNC PyInit___triton_launcher(void) { if (!initSymbolTable()) { return NULL; } PyObject *m = PyModule_Create(&ModuleDef); if(m == NULL) { return NULL; } data_ptr_str = PyUnicode_InternFromString("data_ptr"); if(data_ptr_str == NULL) { return NULL; } PyModule_AddFunctions(m, ModuleMethods); return m; } ) enumeratevaluesrM_BASE_ARGS_FORMATrlistfilterboolrVr&itemsFLOAT_STORAGE_TYPErPrFLOAT_PACK_FUNCTIONrqr) constantsr warp_sizeridxsr args_formatformatr args_list arg_decl_list arg_declsinternal_args_listfloat_storage_declsrIparamsr~rrrs @@@r make_launcherrs6 *'00A)BRBRBT0U&VWFCaWIW''93C3C3EFR9R=FGK  ,F193C3C3EFGIVD)//#"678I"+I"67$!QA7I7PST]P^abPbtyy L)//:K LLLhjIM"<2    # #  $6r$:#;4s!C D  IbM?$qc!: ; < -(I"22 a5C<  % %8&< = % %  % %QCx&8 9 ;   % %QCj 1 2__& Ar # # b ! "%s+6I"6M5NeTUSVVXY 12K%I' (F&/oo&7 MUQ2;LQCj MF M MM#$ MM$% -.9Mi:@RUXYbUcfgUgAEHQAQmo@ppyy()*^^g]hiST]R]Y^r88Y__=N OEAr#$E!B / OPQR &x(RS\Q\]88  !"299R[RaRaRcdINIJLNoqrsotx{o{&qc);A3bCSTUSVVjkBDDdeffS|@R|SVW|WTX[_[d[dew[xTx]_S`+`ob CF JY XF8,Np PBds/ O%O # O%.O OOO# 1#O)0cfd}|S)zN Replace all tensor descriptors with the base ptr, shape, and strides cl|dtt}|ttd}g}|D]}t|tr\|j |j g|j |j|jdk(|j |jo|j|g||S)Nnan) r&rrr extendbaseshapestridespaddingrP)args meta_argsraw_kernel_args final_argsarglaunchers rinnerz,wrap_handle_tensor_descriptor..innerrs0#/01 s#4567 " 'C#/0!!388"vcii"v#++"vs{{V[G["v^a^g^g"vjmjuju"vw!!#& '00Z00rr)rrs` rwrap_handle_tensor_descriptorrms 1" LrceZdZdZdZy) HIPLauncherc`tdr jn t}fd}|jDcic]\}}|||}}}jjDcic]\}}|| }}}t |||j tdt}td|jD} | rt|jn |j|_ |j|_ |j|_|j|_ycc}}wcc}}w)Nrctt|tr&jjj |fS|SN)rrfn arg_namesindex)xr~s rz&HIPLauncher.__init__..s-Z3=OSVV--33A69UVr__triton_launcherr}c3bK|]'}t|txr|jd)yw)rN)rrr)rrs rrz'HIPLauncher.__init__..s)!v\_*S#"6"W3>>,;W"W!vs-/)rvrdictrrrrrranyrrlaunchlaunch_cooperative_gridprofile_scratch_sizeprofile_scratch_align) rr~metadatararg_idxrvaluerrhas_tensor_desc_args ` rrzHIPLauncher.__init__s%,S+%>CMMDF V;D??;LMZS%WS\5(M M25--2E2E2GHJCS%ZH HIy(2D2DE%#4GVbc!!vclcscscu!vvCV3CJJ?\_\f\f '/'G'G$$,$A$A!%-%C%C"NHs D$/ D*c fd}||j|jtj}|j|j ||g|y)Nc^|dkDr'zz}||z}|j}||| SyNr)get) r(align allocator grid_size alloc_sizealloc_fngridXgridYgridZstreams rallocate_scratchz.HIPLauncher.__call__..allocate_scratchs?ax!EME1 &- $==? E6::r)rrr_profile_allocatorrr) rr'r(r)r*functionrr+profile_scratchs ```` r__call__zHIPLauncher.__call__sW +4+D+DdF`F`+6+I+IK  D00%vxYhpkoprN)rrrrr/rrrr r s D qrr cbeZdZfdZdZedZdedefdZdZ dZ d Z d Z d Z xZS) HIPDrivercVt|t|_t|_yr )rwrrsutilsr  launcher_cls)rrzs rrzHIPDriver.__init__s Z 'rc"ddl}|jSr )r?cudarr?s rget_device_interfacezHIPDriver.get_device_interfaceszzrc ddl}|jjxr|jjduS#t $rYywxYw)NrF)r?r6 is_availableversionhip ImportError)r?s r is_activezHIPDriver.is_activesC  ::**,P%--2C2C42O P  s7: AArreturnct|Sr )r)rrs rmap_python_to_cpp_typez HIPDriver.map_python_to_cpp_types }rc|j}|jj|}tjj xs|d}|d}t d|jdd|S)NarchwarpSizer<rAr)get_current_devicer3rrruntime override_archrrV)rdevicedevice_propertiesrCrs rget_current_targetzHIPDriver.get_current_targetse((* JJ<>rcJddl}|jd|jS)Nrr6)r?rHrEr7s rget_active_torch_devicez!HIPDriver.get_active_torch_devices||FD$;$;$=>>rcddlm}|S)Nr)do_bench)triton.testingrN)rrNs rget_benchmarkerzHIPDriver.get_benchmarkers +rcbddl}d}|jt|dz|jdS)Nrir6)rrH)r?emptyint)rr? cache_sizes rget_empty_cache_for_benchmarkz'HIPDriver.get_empty_cache_for_benchmarks.' {{3zQ/uyy{PPrc$|jyr )zero_)rcaches r clear_cachezHIPDriver.clear_caches  r)rrrrr8 staticmethodr>rrArJrLrPrVrZrrs@rr1r1sN( ?? Qrr1)% functoolsr!rWrpathlibrtritonrtriton.backends.compilerrtriton.backends.driverrtriton.runtimertriton.runtime.buildrtriton.tools.tensor_descriptorr r;rNrealpathrOrMrr< lru_cacherqobjectrsrrrrrrr r1rrrrgs  .,&8; ''//"''**84 5 Wi01 -`\R\R~?v?( .            "M` 2q&q@. .r