import gzip
import json
import os
import shutil
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Iterable
from enum import Enum
from functools import partial
from typing import Any, Callable, Optional
from typing_extensions import Self
from warnings import warn

import torch
import torch.autograd.profiler as prof
from torch._C import _get_privateuse1_backend_name
from torch._C._profiler import (
    _add_execution_trace_observer,
    _disable_execution_trace_observer,
    _enable_execution_trace_observer,
    _ExperimentalConfig,
    _remove_execution_trace_observer,
)
from torch._environment import is_fbcode
from torch._utils_internal import profiler_allow_cudagraph_cupti_lazy_reinit_cuda12
from torch.autograd import kineto_available, ProfilerActivity
from torch.profiler._memory_profiler import MemoryProfile, MemoryProfileTimeline


__all__ = [
    "supported_activities",
    "ProfilerAction",
    "schedule",
    "tensorboard_trace_handler",
    "profile",
    "ExecutionTraceObserver",
]
PROFILER_STEP_NAME = "ProfilerStep"


class _NumpyEncoder(json.JSONEncoder):
    """
    Json encoder for numpy types (np.int, np.float, np.array etc.)
    Falls back to the default encoder if numpy is not available.
    """

    def default(self, obj):
        """Encode NumPy types to JSON"""
        try:
            import numpy as np
        except ImportError:
            return json.JSONEncoder.default(self, obj)
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return json.JSONEncoder.default(self, obj)


def supported_activities():
    """
    Returns a set of supported profiler tracing activities.

    Note: profiler uses CUPTI library to trace on-device CUDA kernels.
    In case when CUDA is enabled but CUPTI is not available, passing
    ``ProfilerActivity.CUDA`` to profiler results in using the legacy CUDA
    profiling code (same as in the legacy ``torch.autograd.profiler``).
    This, in turn, results in including CUDA time in the profiler table output,
    but not in the JSON trace.
    """
    return torch.autograd._supported_activities()


class _ITraceObserver(ABC):
    """Abstract interface for a Trace observer.
    This satisfies 3 methods: start, stop and cleanup."""

    @abstractmethod
    def start(self):
        pass

    @abstractmethod
    def stop(self):
        pass

    @abstractmethod
    def cleanup(self):
        pass
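

# Illustrative sketch (added example, not part of the original module): choose
# profiling activities based on what the current build supports. Only the
# helper name `_example_choose_activities` is invented here; the APIs it calls
# are defined above.
def _example_choose_activities():
    activities = {ProfilerActivity.CPU}
    if ProfilerActivity.CUDA in supported_activities():
        # CUPTI-backed CUDA tracing is available in this build
        activities.add(ProfilerActivity.CUDA)
    return activities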


class _KinetoProfile:
    """Low-level profiler that wraps the autograd profiler.

    Args:
        activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
            ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
            ``torch.profiler.ProfilerActivity.XPU``.
            Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
            or (when available) ProfilerActivity.XPU.
        record_shapes (bool): save information about operator's input shapes.
        profile_memory (bool): track tensor memory allocation/deallocation (see ``export_memory_timeline``
            for more details).
        with_stack (bool): record source information (file and line number) for the ops.
        with_flops (bool): use a formula to estimate the FLOPS of specific operators
            (matrix multiplication and 2D convolution).
        with_modules (bool): record module hierarchy (including function names)
            corresponding to the callstack of the op. e.g. If module A's forward calls
            module B's forward which contains an aten::add op,
            then aten::add's module hierarchy is A.B.
            Note that this support exists, at the moment, only for TorchScript models
            and not eager mode models.
        experimental_config (_ExperimentalConfig) : A set of experimental options
            used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.
        execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
            PyTorch Execution Traces offer a graph based representation of AI/ML workloads and
            enable replay benchmarks, simulators, and emulators.
            When this argument is included the observer start() and stop() will be called for the
            same time window as PyTorch profiler.
        acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles.

    .. note::
        This API is experimental and subject to change in the future.

        Enabling shape and stack tracing results in additional overhead.
        When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
        that may further prevent certain optimizations that depend on the reference count and introduce
        extra tensor copies.
    """

    def __init__(
        self,
        *,
        activities: Optional[Iterable[ProfilerActivity]] = None,
        record_shapes: bool = False,
        profile_memory: bool = False,
        with_stack: bool = False,
        with_flops: bool = False,
        with_modules: bool = False,
        experimental_config: Optional[_ExperimentalConfig] = None,
        execution_trace_observer: Optional[_ITraceObserver] = None,
        acc_events: bool = False,
        custom_trace_id_callback: Optional[Callable[[], str]] = None,
    ) -> None:
        self.activities = set(activities) if activities else supported_activities()
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules
        self.experimental_config = experimental_config
        self.execution_trace_observer = execution_trace_observer
        self.acc_events = acc_events
        self.custom_trace_id_callback = custom_trace_id_callback
        self.profiler: Optional[prof.profile] = None
        self.has_cudagraphs = False
        self.mem_tl: Optional[MemoryProfileTimeline] = None
        self.use_device = None
        if ProfilerActivity.CUDA in self.activities:
            self.use_device = "cuda"
        elif ProfilerActivity.XPU in self.activities:
            self.use_device = "xpu"
        elif ProfilerActivity.MTIA in self.activities:
            self.use_device = "mtia"
        elif ProfilerActivity.HPU in self.activities:
            self.use_device = "hpu"
        elif ProfilerActivity.PrivateUse1 in self.activities:
            self.use_device = _get_privateuse1_backend_name()

        # user-defined metadata to be amended to the trace
        self.preset_metadata: dict[str, str] = {}

    def start(self):
        self.prepare_trace()
        self.start_trace()

    def stop(self):
        self.stop_trace()

    def prepare_trace(self):
        if hasattr(torch, "_inductor"):
            import torch._inductor.config as inductor_config

            self.has_cudagraphs = inductor_config.triton.cudagraphs
        if (self.profiler is None) or (not self.acc_events):
            self.profiler = prof.profile(
                use_cpu=(ProfilerActivity.CPU in self.activities),
                use_device=self.use_device,
                record_shapes=self.record_shapes,
                with_flops=self.with_flops,
                profile_memory=self.profile_memory,
                with_stack=self.with_stack,
                with_modules=self.with_modules,
                use_kineto=True,
                experimental_config=self.experimental_config,
                acc_events=self.acc_events,
                custom_trace_id_callback=self.custom_trace_id_callback,
            )
        self.profiler._prepare_trace()

    def start_trace(self):
        if self.execution_trace_observer:
            self.execution_trace_observer.start()
        assert self.profiler is not None
        self.profiler._start_trace()

        if self.profile_memory:
            self.add_metadata_json("profile_memory", "1")
        if self.with_stack:
            self.add_metadata_json("with_stack", "1")
        if self.record_shapes:
            self.add_metadata_json("record_shapes", "1")
        if self.with_modules:
            self.add_metadata_json("with_modules", "1")
        if self.with_flops:
            self.add_metadata_json("with_flops", "1")

        if kineto_available():
            dist_info = self._get_distributed_info()
            if dist_info:
                self.add_metadata_json(
                    "distributedInfo", json.dumps(dist_info, cls=_NumpyEncoder)
                )

            cuda_version = None
            if hasattr(torch, "version"):
                from torch.torch_version import TorchVersion

                cuda_version = TorchVersion(getattr(torch.version, "cuda", "0.0"))

            if self.has_cudagraphs and (
                (cuda_version and cuda_version < "12.6")
                or not profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()
            ):
                os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
                self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
                # CUDA Graphs do not work well with CUPTI teardown, so turn
                # CUPTI teardown off when cudagraphs are enabled.
                os.environ["TEARDOWN_CUPTI"] = "0"

        # Insert the preset user metadata to the trace
        for k, v in self.preset_metadata.items():
            self.add_metadata_json(k, v)

    def stop_trace(self):
        if self.execution_trace_observer:
            self.execution_trace_observer.stop()
        assert self.profiler is not None
        self.profiler.__exit__(None, None, None)

    def export_chrome_trace(self, path: str):
        """
        Exports the collected trace in Chrome JSON format. If kineto is enabled, only
        last cycle in schedule is exported.
        """
        assert self.profiler
        if path.endswith(".gz"):
            fp = tempfile.NamedTemporaryFile("w+b", suffix=".json", delete=False)
            fp.close()
            retvalue = self.profiler.export_chrome_trace(fp.name)
            with open(fp.name, "rb") as fin:
                with gzip.open(path, "wb") as fout:
                    fout.writelines(fin)
            os.remove(fp.name)
            return retvalue
        else:
            return self.profiler.export_chrome_trace(path)

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        """Save stack traces to a file

        Args:
            path (str): save stacks file to this location;
            metric (str): metric to use: "self_cpu_time_total" or "self_cuda_time_total"
        """
        assert self.profiler
        return self.profiler.export_stacks(path, metric)
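    # ------------------------------------------------------------------
    # Illustrative note (added commentary, not in the original module):
    # export_chrome_trace() picks compression from the path suffix, so a
    # hypothetical profiling session could do either of:
    #
    #   prof.export_chrome_trace("trace.json")     # plain Chrome trace
    #   prof.export_chrome_trace("trace.json.gz")  # gzip-compressed trace
    #
    # The ".gz" branch above writes an uncompressed temporary file first
    # and then gzips it into place.
    # ------------------------------------------------------------------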

    def toggle_collection_dynamic(
        self, enable: bool, activities: Iterable[ProfilerActivity]
    ):
        """Toggle collection of activities on/off at any point of collection. Currently supports
        toggling Torch Ops (CPU) and CUDA activity supported in Kineto.

        Args:
            activities (iterable): list of activity groups to use in profiling, supported values:
                ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``

        Examples:

        .. code-block:: python

            with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ]
            ) as p:
                code_to_profile_0()
                # turn off collection of all CUDA activity
                p.toggle_collection_dynamic(False, [torch.profiler.ProfilerActivity.CUDA])
                code_to_profile_1()
                # turn on collection of all CUDA activity
                p.toggle_collection_dynamic(True, [torch.profiler.ProfilerActivity.CUDA])
                code_to_profile_2()
            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
        """
        if not self.profiler:
            return
        self.profiler.toggle_collection_dynamic(enable, activities)

    def key_averages(
        self,
        group_by_input_shape: bool = False,
        group_by_stack_n: int = 0,
        group_by_overload_name: bool = False,
    ):
        """Averages events, grouping them by operator name and (optionally) input shapes, stack
        and overload name.

        .. note::
            To use shape/stack functionality make sure to set record_shapes/with_stack
            when creating profiler context manager.
        """
        assert self.profiler
        return self.profiler.key_averages(
            group_by_input_shape, group_by_stack_n, group_by_overload_name
        )

    def events(self):
        """
        Returns the list of unaggregated profiler events,
        to be used in the trace callback or after the profiling is finished
        """
        assert self.profiler
        return self.profiler.function_events

    def add_metadata(self, key: str, value: str) -> None:
        """
        Adds a user defined metadata with a string key and a string value
        into the trace file
        """
        wrapped_value = '"' + value.replace('"', '\\"') + '"'
        torch.autograd._add_metadata_json(key, wrapped_value)

    def add_metadata_json(self, key: str, value: str) -> None:
        """
        Adds a user defined metadata with a string key and a valid json value
        into the trace file
        """
        torch.autograd._add_metadata_json(key, value)

    def preset_metadata_json(self, key: str, value: str) -> None:
        """
        Preset a user defined metadata when the profiler is not started
        and added into the trace file later.
        Metadata is in the format of a string key and a valid json value
        """
        self.preset_metadata[key] = value

    def _get_distributed_info(self):
        import torch.distributed as dist

        if not dist.is_available() or not dist.is_initialized():
            return None

        backend = dist.get_backend()
        dist_info = {
            "backend": backend,
            "rank": dist.get_rank(),
            "world_size": dist.get_world_size(),
            "pg_count": dist.get_pg_count(),
            "pg_config": dist.distributed_c10d._get_all_pg_configs(),
        }
        if backend == "nccl":
            nccl_version = torch.cuda.nccl.version()
            dist_info["nccl_version"] = ".".join(str(v) for v in nccl_version)
        return dist_info

    def _memory_profile(self) -> MemoryProfile:
        required = ("record_shapes", "profile_memory", "with_stack")
        missing = [f"{i}=True" for i in required if not getattr(self, i)]
        if missing:
            raise ValueError(f"{', '.join(missing)} required for memory profiling.")

        assert self.profiler is not None and self.profiler.kineto_results is not None
        return MemoryProfile(self.profiler.kineto_results)

    def export_memory_timeline(self, path: str, device: Optional[str] = None) -> None:
        """Export memory event information from the profiler collected
        tree for a given device, and export a timeline plot. There are 3
        exportable files using ``export_memory_timeline``, each controlled by the
        ``path``'s suffix.

        - For an HTML compatible plot, use the suffix ``.html``, and a memory timeline
          plot will be embedded as a PNG file in the HTML file.

        - For plot points consisting of ``[times, [sizes by category]]``, where
          ``times`` are timestamps and ``sizes`` are memory usage for each category.
          The memory timeline plot will be saved as a JSON (``.json``) or gzipped JSON
          (``.json.gz``) depending on the suffix.

        - For raw memory points, use the suffix ``.raw.json.gz``. Each raw memory
          event will consist of ``(timestamp, action, numbytes, category)``, where
          ``action`` is one of ``[PREEXISTING, CREATE, INCREMENT_VERSION, DESTROY]``,
          and ``category`` is one of the enums from
          ``torch.profiler._memory_profiler.Category``.

        Output: Memory timeline written as gzipped JSON, JSON, or HTML.
        """
        # Default to device 0, if unset. Fallback on cpu.
        if device is None:
            if self.use_device and self.use_device != "cuda":
                device = self.use_device + ":0"
            else:
                device = "cuda:0" if torch.cuda.is_available() else "cpu"

        # Construct the memory timeline plot data
        self.mem_tl = MemoryProfileTimeline(self._memory_profile())

        # Depending on the file suffix, save the data as json.gz or json. For
        # html, the image is embedded into an HTML file.
        if path.endswith(".html"):
            self.mem_tl.export_memory_timeline_html(path, device)
        elif path.endswith(".gz"):
            fp = tempfile.NamedTemporaryFile("w+t", suffix=".json", delete=False)
            fp.close()
            if path.endswith("raw.json.gz"):
                self.mem_tl.export_memory_timeline_raw(fp.name, device)
            else:
                self.mem_tl.export_memory_timeline(fp.name, device)
            with open(fp.name) as fin:
                with gzip.open(path, "wt") as fout:
                    fout.writelines(fin)
            os.remove(fp.name)
        else:
            self.mem_tl.export_memory_timeline(path, device)


class ProfilerAction(Enum):
    """
    Profiler actions that can be taken at the specified intervals
    """

    NONE = 0
    WARMUP = 1
    RECORD = 2
    RECORD_AND_SAVE = 3


def schedule(
    *,
    wait: int,
    warmup: int,
    active: int,
    repeat: int = 0,
    skip_first: int = 0,
    skip_first_wait: int = 0,
) -> Callable:
    """
    Returns a callable that can be used as profiler ``schedule`` argument. The profiler will skip
    the first ``skip_first`` steps, then wait for ``wait`` steps, then do the warmup for the next
    ``warmup`` steps, then do the active recording for the next ``active`` steps and then repeat
    the cycle starting with ``wait`` steps. The optional number of cycles is specified with the
    ``repeat`` parameter, the zero value means that the cycles will continue until the profiling is
    finished.

    The ``skip_first_wait`` parameter controls whether the first ``wait`` stage should be skipped.
    This can be useful if a user wants to wait longer than ``skip_first`` between cycles, but not
    for the first profile. For example, if ``skip_first`` is 10 and ``wait`` is 20, the first cycle
    will wait 10 + 20 = 30 steps before warmup if ``skip_first_wait`` is zero, but will wait only 10
    steps if ``skip_first_wait`` is non-zero. All subsequent cycles will then wait 20 steps between
    the last active and warmup.
    """

    def schedule_fn(step: int) -> ProfilerAction:
        assert step >= 0
        if step < skip_first:
            return ProfilerAction.NONE
        else:
            step -= skip_first
        # If skip_first_wait is set, shift the step so the first wait stage
        # is skipped and warmup begins right after skip_first.
        if skip_first_wait != 0:
            step += wait
        num_steps = wait + warmup + active
        if repeat > 0 and step / num_steps >= repeat:
            return ProfilerAction.NONE
        mod_step = step % num_steps
        if mod_step < wait:
            return ProfilerAction.NONE
        elif mod_step < wait + warmup:
            return ProfilerAction.WARMUP
        else:
            return (
                ProfilerAction.RECORD
                if mod_step < num_steps - 1
                else ProfilerAction.RECORD_AND_SAVE
            )

    assert (
        wait >= 0 and warmup >= 0 and active > 0 and repeat >= 0 and skip_first >= 0
    ), "Invalid profiler schedule arguments"
    if warmup == 0:
        warn("Profiler won't be using warmup, this can skew profiler results")
    return schedule_fn


def _default_schedule_fn(_: int) -> ProfilerAction:
    """
    Default profiler behavior - immediately starts recording the events,
    keeps doing it on every profiler step.
    """
    return ProfilerAction.RECORD


def tensorboard_trace_handler(
    dir_name: str, worker_name: Optional[str] = None, use_gzip: bool = False
):
    """
    Outputs tracing files to the ``dir_name`` directory, which can then be
    directly delivered to tensorboard as logdir.
    ``worker_name`` should be unique for each worker in distributed scenario,
    it will be set to '[hostname]_[pid]' by default.
    """
    import os
    import socket
    import time

    def handler_fn(prof) -> None:
        nonlocal worker_name
        if not os.path.isdir(dir_name):
            try:
                os.makedirs(dir_name, exist_ok=True)
            except Exception as e:
                raise RuntimeError("Can't create directory: " + dir_name) from e
        if not worker_name:
            worker_name = f"{socket.gethostname()}_{os.getpid()}"
        # Use nanoseconds here to avoid naming clashes when exporting the trace
        file_name = f"{worker_name}.{time.time_ns()}.pt.trace.json"
        if use_gzip:
            file_name = file_name + ".gz"
        prof.export_chrome_trace(os.path.join(dir_name, file_name))

    return handler_fn
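

# Illustrative sketch (added example, not part of the original module): wire
# `schedule` and `tensorboard_trace_handler` together. With wait=1, warmup=1,
# active=2, repeat=1 and skip_first=0, the schedule yields step 0 -> NONE,
# step 1 -> WARMUP, step 2 -> RECORD, step 3 -> RECORD_AND_SAVE (on_trace_ready
# fires here), and NONE afterwards since the single cycle is complete. The
# `train_step` callable and the "./log" directory are assumptions.
def _example_scheduled_profiling(train_step, num_steps=8):
    with profile(  # `profile` is defined later in this module
        schedule=schedule(wait=1, warmup=1, active=2, repeat=1),
        on_trace_ready=tensorboard_trace_handler("./log"),
    ) as p:
        for _ in range(num_steps):
            train_step()
            p.step()  # advance the schedule; the trace is saved after step 3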


class profile(_KinetoProfile):
    """Profiler context manager.

    Args:
        activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
            ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
            ``torch.profiler.ProfilerActivity.XPU``.
            Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
            or (when available) ProfilerActivity.XPU.
        schedule (Callable): callable that takes step (int) as a single parameter and returns
            ``ProfilerAction`` value that specifies the profiler action to perform at each step.
        on_trace_ready (Callable): callable that is called at each step when ``schedule``
            returns ``ProfilerAction.RECORD_AND_SAVE`` during the profiling.
        record_shapes (bool): save information about operator's input shapes.
        profile_memory (bool): track tensor memory allocation/deallocation.
        with_stack (bool): record source information (file and line number) for the ops.
        with_flops (bool): use a formula to estimate the FLOPs (floating point operations) of
            specific operators (matrix multiplication and 2D convolution).
        with_modules (bool): record module hierarchy (including function names)
            corresponding to the callstack of the op. e.g. If module A's forward calls
            module B's forward which contains an aten::add op,
            then aten::add's module hierarchy is A.B.
            Note that this support exists, at the moment, only for TorchScript models
            and not eager mode models.
        experimental_config (_ExperimentalConfig) : A set of experimental options
            used for Kineto library features. Note, backward compatibility is not guaranteed.
        execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
            PyTorch Execution Traces offer a graph based representation of AI/ML workloads and
            enable replay benchmarks, simulators, and emulators.
            When this argument is included the observer start() and stop() will be called for the
            same time window as PyTorch profiler. See the examples section below for a code sample.
        acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles.
        use_cuda (bool):
            .. deprecated:: 1.8.1
                use ``activities`` instead.

    .. note::
        Use :func:`~torch.profiler.schedule` to generate the callable schedule.
        Non-default schedules are useful when profiling long training jobs
        and allow the user to obtain multiple traces at the different iterations
        of the training process.
        The default schedule simply records all the events continuously for the
        duration of the context manager.

    .. note::
        Use :func:`~torch.profiler.tensorboard_trace_handler` to generate result files for TensorBoard:

        ``on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name)``

        After profiling, result files can be found in the specified directory. Use the command:

        ``tensorboard --logdir dir_name``

        to see the results in TensorBoard.
        For more information, see the PyTorch Profiler TensorBoard Plugin.

    .. note::
        Enabling shape and stack tracing results in additional overhead.
        When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
        that may further prevent certain optimizations that depend on the reference count and introduce
        extra tensor copies.

    Examples:
    .. code-block:: python

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ]
        ) as p:
            code_to_profile()
        print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))

    Using the profiler's ``schedule``, ``on_trace_ready`` and ``step`` functions:

    .. code-block:: python

        # Non-default profiler schedule allows user to turn profiler on and off
        # on different iterations of the training loop;
        # trace_handler is called every time a new trace becomes available
        def trace_handler(prof):
            print(
                prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
            )
            # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step_num) + ".json")


        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            # In this example with wait=1, warmup=1, active=2, repeat=1,
            # profiler will skip the first step/iteration,
            # start warming up on the second, record
            # the third and the fourth iterations,
            # after which the trace will become available
            # and on_trace_ready (when set) is called;
            # the cycle repeats starting with the next step
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1),
            on_trace_ready=trace_handler,
            # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
            # used when outputting for tensorboard
        ) as p:
            for iter in range(N):
                code_iteration_to_profile(iter)
                # send a signal to the profiler that the next iteration has started
                p.step()

    The following sample shows how to set up an Execution Trace Observer (`execution_trace_observer`)

    .. code-block:: python

        with torch.profiler.profile(
            ...
            execution_trace_observer=(
                ExecutionTraceObserver().register_callback("./execution_trace.json")
            ),
        ) as p:
            for iter in range(N):
                code_iteration_to_profile(iter)
                p.step()

    You can also refer to test_execution_trace_with_kineto() in tests/profiler/test_profiler.py.
    Note: One can also pass any object satisfying the _ITraceObserver interface.
    """

    def __init__(
        self,
        *,
        activities: Optional[Iterable[ProfilerActivity]] = None,
        schedule: Optional[Callable[[int], ProfilerAction]] = None,
        on_trace_ready: Optional[Callable[..., Any]] = None,
        record_shapes: bool = False,
        profile_memory: bool = False,
        with_stack: bool = False,
        with_flops: bool = False,
        with_modules: bool = False,
        experimental_config: Optional[_ExperimentalConfig] = None,
        execution_trace_observer: Optional[_ITraceObserver] = None,
        acc_events: bool = False,
        use_cuda: Optional[bool] = None,
        custom_trace_id_callback: Optional[Callable[[], str]] = None,
    ):
        activities_set = set(activities) if activities else supported_activities()
        if use_cuda is not None:
            warn(
                "`use_cuda` is deprecated, use `activities` argument instead",
                FutureWarning,
                stacklevel=2,
            )
            if use_cuda:
                activities_set.add(ProfilerActivity.CUDA)
            elif ProfilerActivity.CUDA in activities_set:
                activities_set.remove(ProfilerActivity.CUDA)
        assert len(activities_set) > 0, "No valid profiler activities found"

        super().__init__(
            activities=activities_set,
            record_shapes=record_shapes,
            profile_memory=profile_memory,
            with_stack=with_stack,
            with_flops=with_flops,
            with_modules=with_modules,
            experimental_config=experimental_config,
            execution_trace_observer=execution_trace_observer
            if execution_trace_observer
            else ExecutionTraceObserver.build_execution_trace_obs_from_env(),
            acc_events=acc_events,
            custom_trace_id_callback=custom_trace_id_callback,
        )

        if schedule:
            self.schedule = schedule
            # add step markers into the trace and table view
            self.record_steps = True
        else:
            self.schedule = _default_schedule_fn
            self.record_steps = False
        self.on_trace_ready = on_trace_ready
        self.step_num = 0
        self.current_action = self.schedule(self.step_num)
        self.step_rec_fn: Optional[prof.record_function] = None

        # key is (prev_action, current_action), value is the list of profiler
        # actions to run for that state transition
        self.action_map: dict[
            tuple[ProfilerAction, Optional[ProfilerAction]], list[Any]
        ] = {
            (ProfilerAction.NONE, ProfilerAction.NONE): [],
            (ProfilerAction.NONE, ProfilerAction.WARMUP): [self.prepare_trace],
            (ProfilerAction.NONE, ProfilerAction.RECORD): [
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.NONE, ProfilerAction.RECORD_AND_SAVE): [
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.WARMUP, ProfilerAction.NONE): [
                partial(warn, "Incorrect schedule: WARMUP followed by NONE"),
                self.start_trace,
                self.stop_trace,
            ],
            (ProfilerAction.WARMUP, ProfilerAction.WARMUP): [],
            (ProfilerAction.WARMUP, ProfilerAction.RECORD): [self.start_trace],
            (ProfilerAction.WARMUP, ProfilerAction.RECORD_AND_SAVE): [
                self.start_trace
            ],
            (ProfilerAction.RECORD, ProfilerAction.NONE): [
                partial(warn, "Incorrect schedule: RECORD followed by NONE"),
                self.stop_trace,
            ],
            (ProfilerAction.RECORD, ProfilerAction.WARMUP): [
                partial(warn, "Incorrect schedule: RECORD followed by WARMUP"),
                self.stop_trace,
            ],
            (ProfilerAction.RECORD, ProfilerAction.RECORD): [],
            (ProfilerAction.RECORD, ProfilerAction.RECORD_AND_SAVE): [],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.NONE): [
                self.stop_trace,
                self._trace_ready,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.WARMUP): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD_AND_SAVE): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
                self.start_trace,
            ],
            # used for the exit action
            (ProfilerAction.WARMUP, None): [self.start_trace, self.stop_trace],
            (ProfilerAction.RECORD, None): [self.stop_trace, self._trace_ready],
            (ProfilerAction.RECORD_AND_SAVE, None): [
                self.stop_trace,
                self._trace_ready,
            ],
        }
        # Start tracking increments to profiler step, this will be used by Kineto
        prof.KinetoStepTracker.init_step_count(PROFILER_STEP_NAME)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()
        if self.execution_trace_observer:
            self.execution_trace_observer.cleanup()

    def start(self):
        self._transit_action(ProfilerAction.NONE, self.current_action)
        if self.record_steps:
            self.step_rec_fn = prof.record_function(
                "ProfilerStep#" + str(self.step_num)
            )
            self.step_rec_fn.__enter__()

    def stop(self):
        if self.record_steps and self.step_rec_fn:
            self.step_rec_fn.__exit__(None, None, None)
        self._transit_action(self.current_action, None)
        prof.KinetoStepTracker.erase_step_count(PROFILER_STEP_NAME)

    def step(self):
        """
        Signals the profiler that the next profiling step has started.
        """
        if self.record_steps and self.step_rec_fn:
            self.step_rec_fn.__exit__(None, None, None)
        prev_action = self.current_action
        self.step_num += 1
        self.current_action = self.schedule(self.step_num)

        self._transit_action(prev_action, self.current_action)
        prof.KinetoStepTracker.increment_step(PROFILER_STEP_NAME)

        if self.record_steps:
            self.step_rec_fn = prof.record_function(
                "ProfilerStep#" + str(self.step_num)
            )
            self.step_rec_fn.__enter__()

    def set_custom_trace_id_callback(self, callback):
        """
        Sets a callback to be called when a new trace ID is generated.
        """
        self.custom_trace_id_callback = callback

    def get_trace_id(self):
        """
        Returns the current trace ID.
        """
        if self.profiler is None:
            return None
        return self.profiler.trace_id

    def _trace_ready(self):
        if self.on_trace_ready:
            self.on_trace_ready(self)

    def _transit_action(self, prev_action, current_action):
        action_list = self.action_map.get((prev_action, current_action))
        if action_list:
            for action in action_list:
                action()

    def _stats(self) -> Optional[prof._ProfilerStats]:
        if self.profiler is None:
            return None
        return self.profiler._stats


class ExecutionTraceObserver(_ITraceObserver):
    """Execution Trace Observer

    Each process can have a single ExecutionTraceObserver instance. The observer
    can be added to record function callbacks via calling register_callback()
    explicitly. Without calling unregister_callback(), repeated calls to
    register_callback() will not add additional observers to record function
    callbacks. Once an ExecutionTraceObserver is created, the start() and stop()
    methods control when the event data is recorded.

    Deleting or calling unregister_callback() will remove the observer from the
    record function callbacks, finalize the output file, and will stop
    incurring any overheads.
    """

    def __init__(self) -> None:
        """
        Initializes the default states.
        """
        self._registered = False
        self._execution_trace_running = False
        self.extra_resources_collection = False
        self.resources_dir: str = ""
        self.output_file_path: str = ""
        self.output_file_path_observer: str = ""

    def __del__(self) -> None:
        """
        Calls unregister_callback() to make sure to finalize outputs.
        """
        self.unregister_callback()

    @staticmethod
    def build_execution_trace_obs_from_env() -> Optional["ExecutionTraceObserver"]:
        """
        Returns an ExecutionTraceObserver instance if the environment variable
        ENABLE_PYTORCH_EXECUTION_TRACE is set to 1, otherwise returns None.

        Configures the observer to also collect extra resources if the environment variable
        ``ENABLE_PYTORCH_EXECUTION_TRACE_EXTRAS=1``. These are resources such as generated kernels,
        index tensor data etc. that are required to make the Execution Trace replayable.
        """
        if os.environ.get("ENABLE_PYTORCH_EXECUTION_TRACE", "0") == "1":
            try:
                fp = tempfile.NamedTemporaryFile(
                    "w+t", suffix=".et.json", delete=False
                )
            except Exception as e:
                warn(
                    "Execution trace will not be recorded. "
                    f"Exception on creating default temporary file: {e}"
                )
                return None
            fp.close()
            et = ExecutionTraceObserver()
            et.register_callback(fp.name)
            # additionally check if the environment requires extra resources
            if os.environ.get("ENABLE_PYTORCH_EXECUTION_TRACE_EXTRAS", "0") == "1":
                et.set_extra_resource_collection(True)
            else:
                et.set_extra_resource_collection(False)
            return et
        return None

    def set_extra_resource_collection(self, val) -> None:
        """
        Collects extra resources such as generated kernels, index tensor data, and any other
        metadata that is required to complete the Execution Trace content.

        The caller should call this method with val=True after calling register_callback() if they want
        to collect the extra resources.
        """
        self.extra_resources_collection = val
        if self.extra_resources_collection:
            self.get_resources_dir(can_create=True)
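    # ------------------------------------------------------------------
    # Illustrative note (added commentary, not in the original module):
    # build_execution_trace_obs_from_env() above enables capture without
    # code changes; a hypothetical launch could look like:
    #
    #   ENABLE_PYTORCH_EXECUTION_TRACE=1 \
    #   ENABLE_PYTORCH_EXECUTION_TRACE_EXTRAS=1 \
    #   python train.py
    #
    # which registers an observer on a temporary ".et.json" file and turns
    # on extra resource collection (generated kernels, index tensor data).
    # ------------------------------------------------------------------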

    def register_callback(self, output_file_path: str) -> Self:
        """
        Adds ET observer to record function callbacks. The data will be
        written to output_file_path.
        """

        def get_temp_uncompressed_file() -> str:
            fp = tempfile.NamedTemporaryFile("w+b", suffix=".json", delete=False)
            fp.close()
            return fp.name

        if not self._registered:
            self.output_file_path = output_file_path
            if output_file_path.endswith(".gz"):
                output_file_path = get_temp_uncompressed_file()
            self.output_file_path_observer = output_file_path
            self._registered = _add_execution_trace_observer(output_file_path)
        return self

    def get_resources_dir(self, can_create: bool = False) -> Optional[str]:
        """
        Generates the resources directory for the generated kernels,
        or index tensor data or any other metadata that is required
        to complete the Execution Trace content.

        The directory is created right where the ET file is being output.

        Only works if the observer has called set_extra_resource_collection(val=True).

        Returns None if the observer is not configured with extra resource collection.
        """
        if not self.extra_resources_collection:
            return None
        if self.resources_dir:
            # already generated
            return self.resources_dir
        generated_path = ExecutionTraceObserver.get_resources_dir_for_et_path(
            self.output_file_path, create_dir=can_create
        )
        if not generated_path:
            return None
        self.resources_dir = generated_path
        return self.resources_dir

    @staticmethod
    def get_resources_dir_for_et_path(
        trace_path, create_dir: bool = False
    ) -> Optional[str]:
        work_dir, file_name = os.path.split(trace_path)
        resource_dir = os.path.join(
            work_dir, os.path.splitext(file_name)[0] + "_resources"
        )
        if not os.path.exists(resource_dir):
            if create_dir:
                try:
                    os.mkdir(resource_dir)
                except Exception:
                    warn(f"Execution trace exception when creating {resource_dir}")
                    return None
            else:
                return None
        return resource_dir

    def unregister_callback(self) -> None:
        """
        Removes ET observer from record function callbacks.
        """

        def _save_triton_kernels() -> None:
            try:
                resource_dir = self.get_resources_dir()
            except Exception as e:
                warn(
                    f"Execution trace exception when generating resource directory: {e}"
                )
                return
            if not resource_dir:
                return

            # Save the kernel paths for the generated kernels
            from torch._inductor.codecache import PyCodeCache

            kernel_files = [
                v.__file__
                for v in PyCodeCache.modules
                if getattr(v, "__file__", None) is not None
            ]
            for kernel_file in kernel_files:
                if kernel_file is None:
                    continue
                name = os.path.basename(kernel_file)
                dst = os.path.join(resource_dir, name)
                shutil.copyfile(kernel_file, dst)

        def _save_gz_file(uncompressed_file: str, output_file: str) -> None:
            print(f"Execution Trace: compressing {uncompressed_file} to {output_file}")
            with open(uncompressed_file, "rb") as fin:
                with gzip.open(output_file, "wb") as fout:
                    fout.writelines(fin)
            os.remove(uncompressed_file)

        if self._registered:
            self.stop()
            try:
                _save_triton_kernels()
            except Exception as e:
                warn(f"Execution trace failed to save kernels: {e}")
            _remove_execution_trace_observer()
            if self.output_file_path.endswith(".gz"):
                _save_gz_file(self.output_file_path_observer, self.output_file_path)
            self._registered = False

    @property
    def is_registered(self) -> bool:
        """
        Returns True if the execution trace observer is registered, otherwise False.
        """
        return self._registered

    def is_running(self) -> bool:
        """
        Returns True if the observer is running, otherwise False.
        """
        return self._execution_trace_running

    def start(self) -> None:
        """
        Starts capturing.
        """
        if self._registered and not self._execution_trace_running:
            _enable_execution_trace_observer()
            self._execution_trace_running = True
            self._record_pg_config()

    def stop(self) -> None:
        """
        Stops capturing.
        """
        if self._execution_trace_running:
            _disable_execution_trace_observer()
            self._execution_trace_running = False

    def cleanup(self) -> None:
        """
        Calls unregister_callback() to make sure to finalize outputs.
        """
        self.unregister_callback()

    def get_output_file_path(self) -> Optional[str]:
        """
        Returns the output file name or None.
        """
        if self.output_file_path:
            return self.output_file_path
        return None

    def _record_pg_config(self) -> None:
        # Records the PG config info to the trace as a node:
        #   ## process_group:init ##
        if (
            self.is_registered
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        ):
            pg_config_info = torch.distributed.distributed_c10d._world.pg_config_info
            torch.autograd._record_function_with_args_enter(
                "## process_group:init ##",
                json.dumps(pg_config_info, cls=_NumpyEncoder),
            )
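

# Illustrative sketch (added example, not part of the original module): pair
# the profiler with an ExecutionTraceObserver so both record the same time
# window, following the pattern from the `profile` docstring. The
# `code_iteration_to_profile` callable and the output path are assumptions.
def _example_profile_with_execution_trace(code_iteration_to_profile, n=4):
    et = ExecutionTraceObserver().register_callback("./execution_trace.json")
    with profile(execution_trace_observer=et) as p:
        for i in range(n):
            code_iteration_to_profile(i)
            p.step()
    # finalize and close the Execution Trace output (a no-op if the
    # profiler's exit path already unregistered the observer)
    et.unregister_callback()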