K i>UddlmZmZmZddlmZmZmZmZddl m Z ddl m Z ddl mZddlZddlmZmZmZmZddlmZddlZddlZddlZddlZddlZddlZdd lmZd efd Zd e j@fd Z!ejDdZ#ejDd e$fdZ%de$fdZ&ejDde$fdZ'ejDddZ(de$fdZ)edGddZ*GddeZ+y)) BaseBackend GPUTargetLanguage)irpassesllvmnvidia)knobs) PTXASError) dataclassN)AnyDictTupleOptional) ModuleType)Pathtargetc>dttttffd}|S)Nreturnc|jj}|jj}||k(sJd|dk(ryy)Nz%lhs and rhs bitwidth must be the same)r )rr)scalarprimitive_bitwidth)lhs_typerhs_type lhs_bitwidth rhs_bitwidths e/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibilityz-min_dot_size..check_dot_compatibilitysB99 99 |+T-TT+ 1 )rint)rr"s r! min_dot_sizer%s!uS#s]7K #"r#rc6tjjSN)r r ptxasr#r! get_ptxasr*"s <<  r#ctjj}||Stjt j dgjd}|S)Nz --versionutf-8)r r mock_ptx_version subprocess check_outputr*pathdecode)mock_verversions r!get_ptxas_versionr4&sI||,,H%%y{'7'7&EFMMgVG Nr#ct|tsJtt|j d\}}|dk(r|dkrd|zSd|zdz S|dk(rd|zS|dk(rd |zS|d k\rd }||d z dzz|zSt d |z) zK Get the highest PTX version supported by the current CUDA driver. . Pr F ? Zz?Triton only support CUDA 10.0 or higher, but got CUDA version: ) isinstancestrmapr$split RuntimeError) cuda_versionmajorminorbase_ptxs r!ptx_get_versionrI/s lC (( (sL..s34LE5 { 19: :> ! {Ez {Ez {52:++e33 X[gg hhr#archc`|j}|tj}t|}|Sr') ptx_versionr*r3rI)optionsrJrLrEs r!get_ptx_version_from_optionsrNGs0%%K {** %l3 r#c@t||}td|}d|}|S)NVz+ptx)rNmin)rMrJrLllvm_ptx_versionfeaturess r! get_featuresrTOs0.w=K2{+&'(H Or#ct|d5}tj|jj cdddS#1swYyxYw)Nrb)openhashlibsha256read hexdigest)r0fs r! file_hashr]]s> dD 4Q~~affh'113444s 1AA capabilityc"|dk\rdnd}d||S)Nr?asm_r))r^suffixs r!sm_arch_from_capabilityrdcs!"$S"F  VH %%r#T)frozenczeZdZUdZeed<dZeed<dZeed<dZeed<d Z e eed <d Z e ed <d Z eed <d Zeed<d Ze eed<dZeed<dZeed<dZeed<dZeeed<dZeeed<dZeed<dZeeed<d Zeed<d Zeed<dZeed<d Zeed!<dZeed"<d Z eed#<d$Z!eed%<d&Z"d'Z#y )( CUDAOptions num_warpsrnum_ctas num_stagesr warp_sizeNmaxnreg)rrr cluster_dimsrL ptx_options ir_overrideTenable_fp_fusionFlaunch_cooperative_grid launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr)!deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)rytf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_default extern_libsdebugcuda backend_namesanitize_overflowrJrainstrumentation_modecttjdz }|jint |j}|j dds-t jjxst|dz |d<tj|dt|j|jdkDr|j|jdz zdk(sJdy)Nlib libdevicezlibdevice.10.bcrrrznum_warps must be a power of 2)r__file__parentrdictgetr r libdevice_pathrAobject __setattr__tupleitemsri)selfdefault_libdirrs r! __post_init__zCUDAOptions.__post_init__sh..6 ,,4b$t?O?O:P {D1',||'B'B'mc.[lJlFmK $4k6G6G6I0JK~~!t~~!9K'LQR&R 0/ 0R&Rr#c ^t|j}tdt|dD|d<dj t|j Dcgc] \}}|d|c}}}t j|jdjScc}}w)Nc3<K|]\}}|t|fywr')r]).0kvs r! z#CUDAOptions.hash..s(htq!!Yq\):(hsr_-r,) r__dict__rsortedjoinrrXrYencoder[)r hash_dictnamevalkeys r!hashzCUDAOptions.hashs' #((hviXeNfGg(h#h - hh @Q9RSID#4&#ST~~cjj12<<>>TsB) )$__name__ __module__ __qualname__rir$__annotations__rjrlrmrnrrorrLrprArqrrboolrsrtrwrrxrzr}r~rrrrrrJrrrr)r#r!rgrgisIsHcJIs"GXc]!#L%#KK!%K#%!d!$)T)J'<%*<46%uSz6'--/I %*I*.!4.KE4L#"t"D# "#"0?r#rgceZdZdZedefdZdZdefdZ deddffd Z de fdZ d Z d Zdeeeffd Zd Zed ZedZdZdZdZdZdZej6dZxZS) CUDABackendNrc |jdk(S)Nr)backend)rs r!supports_targetzCUDABackend.supports_targets~~''r#cd}tj||}|std|t|j dS)Nz ^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r)re fullmatch ValueErrorr$group)rrJpatternmatchs r! _parse_archzCUDABackend._parse_archs@ Wd+GyQR R5;;q>""r#rcB|j|j}d|S)Ncuda:)rrJ)rrMr^s r!get_target_namezCUDABackend.get_target_names#%%gll3 zl##r#c2t||d|_y)Ncubin)super__init__ binary_ext)rr __class__s r!rzCUDABackend.__init__s  !r#cdtjjxsd|jji}|j t jjDcic]}||vs|||||c}t|j|d}|jdddkDr|dkrtd|dd|vrFtt j}|d k\r|jd t!t#||d<d |vr |dk\rd |d <d |vrtj$j&|d <|dk(rdnd|d<t di|Scc}w)NrJsmrjrr?zBnum_ctas > 1 requires NVIDIA SM90+ (Hopper). Current target is sm_zM. This configuration will fail. Please set num_ctas=1 or target an SM90+ GPU.rwYfp8e4nvrx)rvrri@rr~r))r runtime override_archrrJupdaterg__dataclass_fields__keysr$rrrsetrwaddrrlanguagedefault_fp_fusion)roptsargsrr^rws r! parse_optionszCUDABackend.parse_optionsso 33NDKKjA|tjjC|tj>jA|tj&jE|t jjFjI||jJ|tj&jM||jJtj&jO|tj&jQ||jJ|n|dzdk\rtj&j=|tj>jA|tjjC|tj&jS|tj&jU|d t jj,jW|tj&jM||jJtj&jO|tj&jY||jJtj&jQ||jJ|tj&jE|tj&jU|d t jj,j[|ntjjC|tj>jA|tjj;|tj&j]|tj&j7||d k\tj&j_|t jj,ja|tj&j1|t jj,jc|tj&je|tj&jg|tjj;|tj>ji||dzd k\r)t jj,jk|t jj,jm||t jj,jo|tj>jq|tj>js|tj>jA||ju||j|j|jf|d<|jw}||d<|S)Nz ttg.maxnregrrrrrr<rr9)rrFTrrotensordesc_meta)44R833B7,,R044R800Z25EF @@D &&r*  v % NN 0 0 4 MM + +B / KK ' ' + MM + +B / NN ; ;B ? MM 4 4R V NN / /CNN C NN - -b 1 NN ' 'CNNL I 2  # NN 0 0 4 MM + +B / KK ' ' + NN 8 8 < NN / /E : MM # # ; ;B ? NN / /CNN C NN - -b 1 NN . .r3>> B NN ' 'CNNL I NN ; ;B ? NN / /D 9 MM # # : :2 > KK ' ' + ''+ &&r*##B'00Z25EF..r2 99"=44R8 33B72226//3 &&r* $$R(  q MM # # 4 4R 8 33B C --b1 r" b! ''+ s $0$<$V>VXdXpXp#q 557&5"# r#c$|}tj|j}|jtj j |tj j|tjj|tjj|tj j|tjj||j||j!|d<|S)Nr)rrrrrgluonradd_resolve_auto_encodingsrr#rr rrrrr$)rsrcrrMr^rrs r!gluon_to_ttgirzCUDABackend.gluon_to_ttgirEs __S[[ )    $ //3 r" &&r* &&r*77; s &)&A&A&C"# r#c t||jj}|}tj|j }|j tjj|tjj|tjj|tjjj|||tjjj!|t"j$j&rtjj)|tjj+|tjjj-||t.j0r+t.j0j3d||j tjjj5|||tj6j9|tj6j;|tjjj=|tjjj?|tj6j9|tj6j;|tj6jA|tjjC|t"j$jDstjFjI|t.j0r+t.j0j3d||j |jK|tMjNtMj }t"j$jPr tSdtMjT||} tW|} tY||jj} d} tjZtMj\| | | | tj^| |j`rItjb| r4|j`D cgc]\} }| }} }tMjd| |tMjf| tLjh|jkd}|||d<|jkd|d<|jkd |d <|jkd |d <|jkd |d<|jkdxsd|d<|jkdxsd|d<tm| }~ ~|Scc}} w)Nttgpuir_to_llvmirllvmir_to_llvmzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudazttg.total-num-warpsriz ttg.sharedrzttg.tensor_memory_size tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_alignzttg.profile_scratch_memory_sizerprofile_scratch_sizez$ttg.profile_scratch_memory_alignmentrprofile_scratch_align)7rNrrJrrrrrrradd_allocate_warp_groupsconvert add_scf_to_cfr add_allocate_shared_memory_nvradd_allocate_tensor_memoryr compilationenable_experimental_consanadd_concurrency_sanitizer"add_allocate_global_scratch_memoryadd_proxy_fence_insertionrrpatch add_to_llvmirrrradd_nvgpu_to_llvmadd_warp_specialize_to_llvmradd_nvvm_to_llvmdisable_line_infollvmir add_di_scoperr init_targets enable_asanrD to_modulerdrT set_short_ptrattach_datalayoutset_nvvm_reflect_ftzrhas_extern_depslink_extern_libsoptimize_module OPTIMIZE_O3 get_int_attrrA)rr+rrMr^rLrrrllvm_modprocrStriplerr0pathstotal_num_warpsrets r! make_llirzCUDABackend.make_llirUs27DKK    7 7 NN 4 4R 899"= 99"jI  & &  ' ' - -.A2s{{ S ++B KH ''+ b! 11"5 ;;B? ''+ b! $$R(''+  22 MM & &r *  & &  ' ' - -.>CKK P s  ,,.    ( (km m>>#w/&z2)9)9:& xx@##H-   6#9#9(#C.5.A.ABltTTBEB  ! !(E 2 Xt'7'78**+@A  &$3H[ ! --l; # 0 01I J*-*:*:;[*\&'+.+;+;*:; ff*i },EsRTR^R^_ff';zl)CSPRP\P\]ff+R5 << " " 4 5 #J r#c tj}tjddd5}tjddd5}|j ||j |j dz}g} tjjr| dd gz } n'tjjr| d gz } n| dgz } |jrgnd g} t|} tjjrd d gng} |jr|jjdng} |g| | d| | d| |j d|} t!j"|dd|tjj$r7t'|j 5}t)|j+dddt,jj/|j rt-j0|j t,jj/|j rt-j0|j t'|d5}|j+}dddt,jj/|rt-j0|ddddddS#1swYxYw#t j2$r}t'|j 5}|j+}dddn #1swYnxYwt,jj/|j rt-j0|j |j4dk(rd}n2|j4dt6j8zk(rd}nd|j4}|dddj;|d}t)d|d|dt=|d}~wwxYw#1swYwxYw#1swYHxYw#1swYSxYw) NFwz.ptx)deletemodercrz.logz.oz -lineinfoz-suppress-debug-infoz-gz --fmad=falsez --opt-level0 z-vz --gpu-name=z-oT)check close_fdsstderrz!Internal Triton PTX codegen errorz`ptxas` raised SIGSEGVz`ptxas` failed with error code z `ptxas` stderr: z Repro command:  zC ================================================================ z zy ================================================================ please share the reproducer above with Triton project. rV)r*r0tempfileNamedTemporaryFilewriteflushrr r;rEr disable_ptxas_optrrrdrprCr.rdump_ptxas_logrWrbrZosexistsremoveCalledProcessError returncodesignalSIGSEGVrr )rr+rrr^r(fsrcflogfbin debug_infofmadrJ disable_optptx_extra_options ptxas_cmdlog_fileelogerrorr\rs r! make_cubinzCUDABackend.make_cubins     ( (C OG SW  ' 'u3v NG RV JJsO JJL99t#DJ  22{,BCC //tf$ {m+ --2N3CD*:6D38,,2P2P=#.VXK?Boo 5 5c :SU "%)+/2=@QU`ae`fSgimirirI$ (ydS<<..dii/Hhmmo./77>>$)),IIdii(77>>$)),IIdii(:dD! !Q !ww~~d# $OG G P O//00 ($))_*"--/C***77>>$)),IIdii(<<3&?E\\S6>>%994E=all^LE!7#--0E2++.88I+>*?rC !''5 (8 ! !IG G G P sO0DO# AJ<J/,BJ< O#O!z(CUDABackend.add_stages..s4>>#xQXZd3er#rc,j||Sr')r'rs r!rz(CUDABackend.add_stages..sDOOCSZ\f4gr#ttgirc,j||Sr')r,rs r!rz(CUDABackend.add_stages..sD4G4GXW^`j4kr#c,j||Sr')rYrs r!rz(CUDABackend.add_stages..st~~c8WV`/ar#llircTj||jjSr')rdrrJr+rrMrs r!rz(CUDABackend.add_stages..s#dmmC7TXT_T_TdTd.er#ptxcTj||jjSr')rrrJrs r!rz(CUDABackend.add_stages..s#XwX\XcXcXhXh0ir#r)rrJrTRITONGLUON)rstagesrMrr^s` ` @r! add_stageszCUDABackend.add_stagesse%%gll3 x &eF6NgF7O  'kF7Oaveu iwr#cLt}|d|jjS)Nr)r4rrJ)rr3s r!rzCUDABackend.hash s&#%!DKK,,-..r#)rrrr staticmethodrrrrArrr rrrrrrrrr'r,rYrdrr functools lru_cacher __classcell__)rs@r!rrsO( ((#$#$"y"T"#S#6 >S*_ 5>;  MM^ FP,JX jY//r#r),triton.backends.compilerrrrtriton._C.libtritonrrrr tritonr triton.runtime.errorsr dataclassesr rtypingr rrrtypesrrXrrrr}rxr.pathlibrr% NvidiaToolr*rr4r$rIrNrTr]rdrgrr)r#r!rs-EE88,!--   # #5##iSii.   T44 &&  $)?)?)?Xw/+w/r#