import cProfile
import inspect
import io
import itertools
import os
import warnings
from collections.abc import Sequence
from contextlib import contextmanager
from functools import wraps
from pstats import Stats
from typing import Any, Callable, cast, Optional, TypeVar, Union

import torch
import torch.distributed as dist
from torch.distributed._shard.sharded_tensor import ShardedTensor
from torch.distributed._shard.sharded_tensor.shard import Shard

from .api import (
    _is_wrapped_exception,
    _wrap_exception,
    CheckpointException,
    WRAPPED_EXCEPTION,
)
from .metadata import MetadataIndex, STATE_DICT_TYPE


__all__ = ["find_tensor_shard", "find_state_dict_object"]

T = TypeVar("T")
R = TypeVar("R")


def _get_failure_dict(
    results: list[Union[T, WRAPPED_EXCEPTION]],
) -> dict[int, WRAPPED_EXCEPTION]:
    return cast(
        dict[int, WRAPPED_EXCEPTION],
        {i: err for i, err in enumerate(results) if _is_wrapped_exception(err)},
    )


def _all_gather_keys(
    local_dict: dict[Any, Any], group: Optional[dist.ProcessGroup] = None
) -> list[Any]:
    """Gathers all keys, and returns them sorted."""
    keys = list(local_dict.keys())
    gathered_keys: list[list[Any]] = [None] * dist.get_world_size(group)  # type: ignore[list-item]

    dist.all_gather_object(gathered_keys, keys, group=group)
    return sorted(set(itertools.chain.from_iterable(gathered_keys)))


def _assert_same_keys(
    state_dict: dict[str, Any], process_group: Optional[dist.ProcessGroup] = None
) -> None:
    """
    Asserts that all ranks have the same keys in their state dict.

    This is a collective call which requires all ranks in ``process_group`` to join.
    It will also induce cross-rank communication and block CPU.
    """
    if dist.get_world_size(process_group) == 1:
        # Single rank: nothing to compare against.
        return

    keys = _all_gather_keys(state_dict, process_group)
    missing = set(keys) - set(state_dict.keys())
    if len(missing) > 0:
        raise AssertionError(f"Key(s) missing on this rank: {missing}")


class _DistWrapper:
    """
    This is a wrapper around PG that provides a series of features around object collectives.

    It works without distributed initialized, where most collectives turn into no-ops.

    All variants that take functions are exception robust, meaning that if one or more
    ranks raise errors, all ranks will observe those.
    """

    def __init__(
        self,
        group: Optional[dist.ProcessGroup],
        use_dist: bool,
        coordinator_rank: int,
    ) -> None:
        self.group = group
        self.use_dist = use_dist
        self.coordinator_rank = coordinator_rank
        if self.use_dist:
            self.rank = dist.get_rank(group)
            self.is_coordinator = self.rank == coordinator_rank
        else:
            self.rank = 0
            self.is_coordinator = True

    def get_rank(self) -> int:
        return self.rank

    def get_world_size(self) -> int:
        if self.use_dist:
            return dist.get_world_size(self.group)
        return 1

    def broadcast_object(self, object: Optional[T]) -> T:
        """Implement functionality similar to c10d::broadcast_object_list but without distributed enabled."""
        object_list = [object]
        if self.use_dist:
            dist.broadcast_object_list(
                object_list=object_list,
                group=self.group,
                src=self.coordinator_rank,
            )
        return cast(T, object_list[0])

    def gather_object(self, object: T) -> Optional[list[T]]:
        """Implement functionality similar to c10d::gather_object but without distributed enabled."""
        if self.use_dist:
            gather_objs = (
                cast(list[T], [None] * dist.get_world_size(self.group))
                if self.is_coordinator
                else None
            )

            dist.gather_object(
                obj=object,
                object_gather_list=gather_objs if self.is_coordinator else None,
                dst=self.coordinator_rank,
                group=self.group,
            )
            result = gather_objs
        else:
            result = [object]
        return result

    def all_gather_object(self, object: T) -> list[T]:
        """Implement functionality similar to c10d::all_gather_object but without distributed enabled."""
        if self.use_dist:
            gather_objs = cast(list[T], [None] * dist.get_world_size(self.group))

            dist.all_gather_object(
                object_list=gather_objs, obj=object, group=self.group
            )
        else:
            gather_objs = [object]
        return gather_objs

    def scatter_object(self, object_list: Optional[list[T]]) -> T:
        """Implement functionality similar to c10d::scatter_object but without distributed enabled."""
        if self.use_dist:
            gather_result = cast(list[T], [None])
            dist.scatter_object_list(
                scatter_object_output_list=gather_result,
                scatter_object_input_list=object_list if self.is_coordinator else None,
                src=self.coordinator_rank,
                group=self.group,
            )

            local_reply = gather_result[0]
        else:
            assert object_list is not None
            local_reply = object_list[0]
        return local_reply

    def reduce_scatter(
        self,
        step: str,
        map_fun: Callable[[], T],
        reduce_fun: Callable[[list[T]], list[R]],
    ) -> R:
        """
        Compute a value on each rank, then do centralized reduce on a single rank, followed by a scatter.

        This method operates in the following way:
            Run ``map_fun`` on all ranks
            Gather results on rank 0
            Call ``reduce_fun`` on all those values
            Scatter to each rank part of the result.
        """
        local_data: Union[WRAPPED_EXCEPTION, T]
        try:
            local_data = map_fun()
        except BaseException as e:
            local_data = _wrap_exception(e)

        all_data = self.gather_object(local_data)
        all_results: Optional[list[Union[R, CheckpointException]]] = None
        if self.is_coordinator:
            assert all_data is not None
            node_failures = _get_failure_dict(all_data)

            if len(node_failures) == 0:
                try:
                    all_results = cast(
                        list[Union[R, CheckpointException]],
                        reduce_fun(cast(list[T], all_data)),
                    )
                except BaseException as e:
                    node_failures[self.rank] = _wrap_exception(e)

            if len(node_failures) > 0:
                all_results = [
                    CheckpointException(step, node_failures)
                ] * self.get_world_size()

        result = self.scatter_object(all_results)
        if isinstance(result, CheckpointException):
            raise result
        return result

    def all_reduce(
        self,
        step: str,
        map_fun: Callable[[], T],
        reduce_fun: Callable[[list[T]], R],
    ) -> R:
        """
        Compute a value on each rank, then do centralized reduce on a single rank, followed by a broadcast.

        This method operates in the following way:
            Run ``map_fun`` on all ranks
            Gather results on rank 0
            Call ``reduce_fun`` on all those values
            Broadcast the reduced value to all ranks.
        """
        local_data: Union[T, WRAPPED_EXCEPTION]
        try:
            local_data = map_fun()
        except BaseException as e:
            local_data = _wrap_exception(e)

        all_data = self.gather_object(local_data)
        result: Optional[Union[R, CheckpointException]] = None
        if self.is_coordinator:
            assert all_data is not None
            node_failures = _get_failure_dict(all_data)
            if len(node_failures) == 0:
                try:
                    result = reduce_fun(cast(list[T], all_data))
                except BaseException as e:
                    node_failures[self.rank] = _wrap_exception(e)

            if len(node_failures) > 0:
                result = CheckpointException(step, node_failures)

        final_result = self.broadcast_object(result)
        if isinstance(final_result, CheckpointException):
            raise final_result
        return cast(R, final_result)

    def all_gather(
        self,
        step: str,
        map_fun: Callable[[], T],
    ) -> list[T]:
        """
        Compute a value on each rank, then all_gather them.

        This method operates in the following way:
            Run ``map_fun`` on all ranks
            all_gather the values to all ranks
        """
        result: Union[T, WRAPPED_EXCEPTION]
        try:
            result = map_fun()
        except BaseException as e:
            result = _wrap_exception(e)

        all_results = self.all_gather_object(result)

        node_failures = _get_failure_dict(all_results)
        if len(node_failures) > 0:
            raise CheckpointException(step, node_failures)
        return cast(list[T], all_results)

    def broadcast(
        self,
        step: str,
        map_fun: Callable[[], T],
    ) -> T:
        """
        Compute a value on rank 0 and broadcast it.

        This method operates in the following way:
            Run ``map_fun`` on rank 0
            broadcast the value
        """
        result: Optional[Union[T, CheckpointException]] = None
        if self.is_coordinator:
            try:
                result = map_fun()
            except BaseException as e:
                result = CheckpointException(step, {self.rank: _wrap_exception(e)})
        final_result = self.broadcast_object(result)
        if isinstance(final_result, CheckpointException):
            raise final_result
        return cast(T, final_result)

    def barrier(self) -> None:
        """
        Add a synchronization point across all processes when using distributed.

        If torch.distributed is initialized, this function will invoke a barrier across the global process group.
        If torch.distributed is not initialized, this function is a no-op.
        """
        if not self.use_dist:
            return
        dist.barrier(group=self.group)


def _find_shard(tensor: ShardedTensor, index: MetadataIndex) -> Shard:
    if index.offset is None:
        raise ValueError(
            f"Cannot lookup {index.fqn} since it's a ShardedTensor and no offset was provided"
        )

    shards = tensor.local_shards()
    # index fast path
    if index.index is not None:
        if (
            len(shards) > index.index
            and torch.Size(shards[index.index].metadata.shard_offsets) == index.offset
        ):
            return shards[index.index]

    for shard in shards:
        if torch.Size(shard.metadata.shard_offsets) == index.offset:
            return shard
    raise ValueError(f"Could not find shard at '{index.offset}' for FQN: '{index.fqn}'")


def find_tensor_shard(tensor: torch.Tensor, index: MetadataIndex) -> torch.Tensor:
    if hasattr(tensor, "__get_tensor_shard__"):
        # DTensor implements _Checkpointable
        return tensor.__get_tensor_shard__(index)  # type: ignore[attr-defined]
    if isinstance(tensor, ShardedTensor):
        return _find_shard(tensor, index).tensor
    if index.offset is not None:
        # special case looking up a tensor by origin
        if index.offset == torch.Size([0] * len(tensor.size())):
            return tensor
        raise ValueError(
            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
        )
    return tensor


def find_state_dict_object(state_dict: STATE_DICT_TYPE, index: MetadataIndex) -> Any:
    if index.fqn not in state_dict:
        raise ValueError(f"Could not find FQN: '{index.fqn}'")
    obj = state_dict[index.fqn]

    if isinstance(obj, torch.Tensor):
        return find_tensor_shard(obj, index)
    elif index.offset is not None:
        raise ValueError(
            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
        )
    return obj


def _element_wise_add(a: Sequence[int], b: Sequence[int]) -> list[int]:
    return [i_a + i_b for i_a, i_b in zip(a, b)]


def _element_wise_sub(a: Sequence[int], b: Sequence[int]) -> list[int]:
    return [i_a - i_b for i_a, i_b in zip(a, b)]


class _ReaderView(io.IOBase):
    def __init__(self, base_stream: io.IOBase, offset: int, len: int) -> None:
        super().__init__()
        self.offset = offset
        self.len = len
        self.base_stream = base_stream
        self.seek(0)

    def seek(self, offset: int, whence: int = os.SEEK_SET, /) -> int:
        if whence == os.SEEK_SET:
            offset = self.offset + offset
        elif whence == os.SEEK_END:
            whence = os.SEEK_SET
            offset = (self.offset + self.len) - offset
        return self.base_stream.seek(offset, whence)

    def tell(self) -> int:
        return self.base_stream.tell() - self.offset

    def readable(self) -> bool:
        return self.base_stream.readable()

    def seekable(self) -> bool:
        return self.base_stream.seekable()

    def readinto(self, b):
        max_size = self.len - self.tell()
        if max_size == 0:
            return 0
        if len(b) > max_size:
            b = memoryview(b)[:max_size]
        return self.base_stream.readinto(b)  # type: ignore[attr-defined]

    def read(self, size=-1):
        max_size = self.len - self.tell()
        if size == -1 or size > max_size:
            size = max_size
        return self.base_stream.read(size)


def _create_file_view(file: io.IOBase, offset: int, length: int) -> io.IOBase:
    # FIXME (kumpera) torch.load fails if we wrap with io.BufferedReader
    return _ReaderView(file, offset, length)


def _normalize_device_info(device_type: str, device_id: int) -> str:
    """Device info normalization."""
    if device_type == "cpu":
        return "cpu"
    return f"{device_type}:{device_id}"


# TODO: integrate with distributed logging flag
ENABLE_PROFILE = False


@contextmanager
def _profile():
    # Only log the profiling when it is enabled and is on rank 0, or dist is
    # not available.
    if ENABLE_PROFILE and (not dist.is_available() or dist.get_rank() == 0):
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            yield
        finally:
            profiler.disable()
            stats = Stats(profiler)
            stats.sort_stats("time").print_stats(10)
    else:
        yield


def _api_bc_check(func):
    @wraps(func)
    def inner_func(*args, **kwargs) -> Any:
        if len(args) == 2:
            warnings.warn(
                f"The argument order of {func.__name__} has been changed. "
                "Please check the document to avoid future breakages."
            )
            sig = inspect.signature(func)
            kwonlyargs = [
                p.name for p in sig.parameters.values() if p.kind == p.KEYWORD_ONLY
            ]
            if "storage_writer" in kwonlyargs:
                assert "storage_writer" not in kwargs, (args, kwargs)
                kwargs["storage_writer"] = args[1]
            elif "storage_reader" in kwonlyargs:
                assert "storage_reader" not in kwargs, (args, kwargs)
                kwargs["storage_reader"] = args[1]
            else:
                raise RuntimeError(f"Unexpected kwonlyargs = {kwonlyargs}")
            return func(args[0], **kwargs)
        else:
            return func(*args, **kwargs)

    return inner_func
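
# Usage sketch (illustrative addition, not part of the upstream module): shows
# how _DistWrapper degrades to local no-op collectives when ``use_dist=False``,
# and how _create_file_view restricts reads to a byte window. Guarded so that
# importing this module is unaffected; runnable via
# ``python -m torch.distributed.checkpoint.utils``.
if __name__ == "__main__":
    # Single process: map_fun runs locally, reduce_fun runs on the coordinator,
    # and the final "broadcast" is a passthrough of the reduced value.
    wrapper = _DistWrapper(group=None, use_dist=False, coordinator_rank=0)
    total = wrapper.all_reduce("demo", map_fun=lambda: 1, reduce_fun=sum)
    assert total == 1

    # A 7-byte window starting at offset 6: reads never cross the view's end.
    base = io.BytesIO(b"headerPAYLOADtrailer")
    view = _create_file_view(base, offset=6, length=7)
    assert view.read() == b"PAYLOAD"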