K iJfddlmZejd dZejd dZej d dZej d dZej d dZej d dZ ej d d Z ej d d Z ej d d Z y))coreNc Vtjddgtjdd|S)Nzmov.u64 $0, %globaltimer;z=lFdtypeis_purepack _semantic)rinline_asm_elementwiseint64r s f/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py globaltimerrs.  & &'BD"TXT^T^hmtu1: <<c Vtjddgtjdd|S)Nzmov.u32 $0, %smid;z=rTrr)rr int32r s rsmidr s+  & &';T2TZZaelm1: <>)++33==B CCrchtj|jjjSNrr s rrrs# >>)++33== >>rc Xtjdd|gtjdd|S)Na{ .reg .b32 a<2>, b<2>; prmt.b32 a0, 0, $2, 0x5746; and.b32 b0, a0, 0x7f007f00; and.b32 b1, a0, 0x00ff00ff; and.b32 a1, a0, 0x00800080; shr.b32 b0, b0, 1; add.u32 b1, b1, a1; lop3.b32 $0, b0, 0x80008000, a0, 0xf8; shl.b32 $1, b1, 7; } z=r,=r,rTr)rr float16)argr s rconvert_fp8e4b15_to_float16r"!s2  & & 46?T\\cgno  rc d}|r|dz }n|dz }|dz }tj|d|gtjdd|S) NaN{ .reg .pred p<4>; .reg .b32 a<2>, b<2>; .reg .b16 c<4>; .reg .b16 max_val_f16; .reg .b32 max_val_f16x2; mov.b16 max_val_f16, 0x3F00; mov.b32 max_val_f16x2, 0x3F003F00; and.b32 a0, $1, 0x7fff7fff; and.b32 a1, $2, 0x7fff7fff;zSmin.f16x2 a0, a0, max_val_f16x2; min.f16x2 a1, a1, max_val_f16x2;asetp.lt.f16x2 p0|p1, a0, max_val_f16x2; setp.lt.f16x2 p2|p3, a1, max_val_f16x2; mov.b32 {c0, c1}, a0; mov.b32 {c2, c3}, a1; selp.b16 c0, c0, max_val_f16, p0; selp.b16 c1, c1, max_val_f16, p1; selp.b16 c2, c2, max_val_f16, p2; selp.b16 c3, c3, max_val_f16, p3; mov.b32 a0, {c0, c1}; mov.b32 a1, {c2, c3};zmad.lo.u32 a0, a0, 2, 0x00800080; mad.lo.u32 a1, a1, 2, 0x00800080; lop3.b32 b0, $1, 0x80008000, a0, 0xea; lop3.b32 b1, $2, 0x80008000, a1, 0xea; prmt.b32 $0, b0, b1, 0x7531; }z=r,r,rTrr)rr float8e4b15)r! has_minx2r asms rconvert_float16_to_fp8e4b15r'2sm +C 6 6 + + C  & &sHse4CSCS]ahi1: <rAs < < < < DD??    <