# coding=utf-8
# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Torch utilities for the Trainer class.
"""

import copy
import datetime
import io
import json
import math
import os
import re
import sys
import warnings
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import chain
from logging import StreamHandler
from typing import Any, Optional, Union

import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
from torch.utils.data.distributed import DistributedSampler

from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .tokenization_utils_base import BatchEncoding
from .utils import (
    is_sagemaker_mp_enabled,
    is_torch_available,
    is_torch_xla_available,
    is_training_run_on_sagemaker,
    logging,
)


if is_training_run_on_sagemaker():
    logging.add_handler(StreamHandler(sys.stdout))

if is_torch_xla_available():
    import torch_xla.runtime as xr

if is_torch_available():
    from torch.optim.lr_scheduler import LRScheduler


logger = logging.get_logger(__name__)


def get_dataloader_sampler(dataloader):
    if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
        return get_dataloader_sampler(dataloader.batch_sampler)
    elif hasattr(dataloader, "sampler"):
        return dataloader.sampler


def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
    if isinstance(tensor_or_array, torch.Tensor):
        if hasattr(torch, "atleast_1d"):
            tensor_or_array = torch.atleast_1d(tensor_or_array)
        elif tensor_or_array.ndim < 1:
            tensor_or_array = tensor_or_array[None]
    else:
        tensor_or_array = np.atleast_1d(tensor_or_array)
    return tensor_or_array


def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
    """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary."""
    tensor1 = atleast_1d(tensor1)
    tensor2 = atleast_1d(tensor2)

    if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
        return torch.cat((tensor1, tensor2), dim=0)

    # Let's figure out the new shape
    new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]

    # Now let's fill the result tensor
    result = tensor1.new_full(new_shape, padding_index)
    result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
    result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
    return result


def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
    """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary."""
    array1 = atleast_1d(array1)
    array2 = atleast_1d(array2)

    if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]:
        return np.concatenate((array1, array2), axis=0)

    # Let's figure out the new shape
    new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:]

    # Now let's fill the result array
    result = np.full_like(array1, padding_index, shape=new_shape)
    result[: array1.shape[0], : array1.shape[1]] = array1
    result[array1.shape[0] :, : array2.shape[1]] = array2
    return result


def nested_concat(tensors, new_tensors, padding_index=-100):
    """
    Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
    nested list/tuples/dict of tensors.
    """
    assert type(tensors) is type(new_tensors), (
        f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
    )
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
    elif isinstance(tensors, torch.Tensor):
        return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    elif isinstance(tensors, Mapping):
        return type(tensors)(
            {k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t in tensors.items()}
        )
    elif isinstance(tensors, np.ndarray):
        return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    else:
        raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}")


def find_batch_size(tensors):
    """
    Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
    """
    if isinstance(tensors, (list, tuple)):
        for t in tensors:
            result = find_batch_size(t)
            if result is not None:
                return result
    elif isinstance(tensors, Mapping):
        for value in tensors.values():
            result = find_batch_size(value)
            if result is not None:
                return result
    elif isinstance(tensors, (torch.Tensor, np.ndarray)):
        return tensors.shape[0] if len(tensors.shape) >= 1 else None


def nested_numpify(tensors):
    "Numpify `tensors` (even if it's a nested list/tuple/dict of tensors)."
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_numpify(t) for t in tensors)
    if isinstance(tensors, Mapping):
        return type(tensors)({k: nested_numpify(t) for k, t in tensors.items()})

    t = tensors.cpu()
    if t.dtype == torch.bfloat16:
        # NumPy does not support bfloat16, so upcast to float32 before converting.
        t = t.to(torch.float32)
    return t.numpy()


def nested_detach(tensors):
    "Detach `tensors` (even if it's a nested list/tuple/dict of tensors)."
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_detach(t) for t in tensors)
    elif isinstance(tensors, Mapping):
        return type(tensors)({k: nested_detach(t) for k, t in tensors.items()})
    return tensors.detach() if isinstance(tensors, torch.Tensor) else tensors


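# Illustrative usage sketch (not part of the module API; `model` and `eval_dataloader` are hypothetical
# placeholders): the helpers above are typically combined in an evaluation loop, accumulating padded
# batches with `nested_concat` and converting to NumPy only once at the end.
#
#     all_logits = None
#     for batch in eval_dataloader:
#         logits = nested_detach(model(**batch).logits)
#         all_logits = logits if all_logits is None else nested_concat(all_logits, logits, padding_index=-100)
#     all_logits = nested_numpify(all_logits)

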
def nested_xla_mesh_reduce(tensors, name):
    if is_torch_xla_available():
        import torch_xla.core.xla_model as xm

        if isinstance(tensors, (list, tuple)):
            return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
        if isinstance(tensors, Mapping):
            return type(tensors)(
                {k: nested_xla_mesh_reduce(t, f"{name}_{i}") for i, (k, t) in enumerate(tensors.items())}
            )

        tensors = atleast_1d(tensors)
        return xm.mesh_reduce(name, tensors, torch.cat)
    else:
        raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")


def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any:
    try:
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
        if isinstance(tensor, Mapping):
            return type(tensor)({k: distributed_concat(t, num_total_examples) for k, t in tensor.items()})
        tensor = atleast_1d(tensor).contiguous()
        output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensor)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")


def distributed_broadcast_scalars(
    scalars: list[Union[int, float]],
    num_total_examples: Optional[int] = None,
    device: Optional[torch.device] = torch.device("cuda"),
) -> torch.Tensor:
    try:
        tensorized_scalar = torch.tensor(scalars, device=device)
        output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensorized_scalar)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")


def reissue_pt_warnings(caught_warnings):
    # Reissue warnings that are not UserWarning
    if len(caught_warnings) > 1:
        for w in caught_warnings:
            if w.category is not UserWarning:
                warnings.warn(w.message, w.category)


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Decorator to make all processes in distributed training wait for each local_master to do something.

    Args:
        local_rank (`int`): The rank of the local process.
    """
    if local_rank not in [-1, 0]:
        dist.barrier()
    yield
    if local_rank == 0:
        dist.barrier()


class DistributedSamplerWithLoop(DistributedSampler):
    """
    Like a torch.utils.data.distributed.DistributedSampler` but loops at the end back to the beginning of the shuffled
    samples to make each process have a round multiple of batch_size samples.

    Args:
        dataset (`torch.utils.data.Dataset`):
            Dataset used for sampling.
        batch_size (`int`):
            The batch size used with this sampler
        kwargs (`dict[str, Any]`, *optional*):
            All other keyword arguments passed to `DistributedSampler`.
    """

    def __init__(self, dataset, batch_size, **kwargs):
        super().__init__(dataset, **kwargs)
        self.batch_size = batch_size

    def __iter__(self):
        indices = list(super().__iter__())
        remainder = 0 if len(indices) % self.batch_size == 0 else self.batch_size - len(indices) % self.batch_size
        # DistributedSampler already added samples from the beginning to make the number of samples a round multiple
        # of the world size, so we skip those.
        start_remainder = 1 if self.rank < len(self.dataset) % self.num_replicas else 0
        indices += indices[start_remainder : start_remainder + remainder]
        return iter(indices)


class SequentialDistributedSampler(Sampler):
    """
    Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.

    Even though we only use this sampler for eval and predict (no training), which means that the model params won't
    have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add
    extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather`
    or `reduce` resulting tensors at the end of the loop.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
        warnings.warn(
            "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        num_samples = len(self.dataset)
        # Add extra samples to make num_samples a multiple of batch_size if passed
        if batch_size is not None:
            self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
        else:
            self.num_samples = int(math.ceil(num_samples / num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        self.batch_size = batch_size

    def __iter__(self):
        indices = list(range(len(self.dataset)))

        # add extra samples to make it evenly divisible
        indices += indices[: (self.total_size - len(indices))]
        assert (
            len(indices) == self.total_size
        ), f"Indices length {len(indices)} and total size {self.total_size} mismatched"

        # subsample
        indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
        assert (
            len(indices) == self.num_samples
        ), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"

        return iter(indices)

    def __len__(self):
        return self.num_samples


def get_tpu_sampler(dataset: torch.utils.data.Dataset, batch_size: int):
    if xr.world_size() <= 1:
        return RandomSampler(dataset)
    return DistributedSampler(dataset, num_replicas=xr.world_size(), rank=xr.global_ordinal())


def nested_new_like(arrays, num_samples, padding_index=-100):
    """Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
    if isinstance(arrays, (list, tuple)):
        return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
    return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))


def expand_like(arrays, new_seq_length, padding_index=-100):
    """Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding."""
    result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
    result[:, : arrays.shape[1]] = arrays
    return result


def nested_truncate(tensors, limit):
    "Truncate `tensors` at `limit` (even if it's a nested list/tuple/dict of tensors)."
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_truncate(t, limit) for t in tensors)
    if isinstance(tensors, Mapping):
        return type(tensors)({k: nested_truncate(t, limit) for k, t in tensors.items()})
    return tensors[:limit]


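# Illustrative usage sketch (hypothetical `all_preds` tensor accumulated on each rank and `eval_dataset`):
# when an eval sampler such as `SequentialDistributedSampler` pads the dataset so every rank sees the same
# number of samples, the gathered result is cut back to the true dataset length.
#
#     preds = distributed_concat(all_preds, num_total_examples=len(eval_dataset))
#     # equivalently, gather first and truncate the nested structure afterwards:
#     preds = nested_truncate(distributed_concat(all_preds), len(eval_dataset))

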
class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.

    If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
    step, our sampler will generate the following indices:

        `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`

    to get something of size a multiple of 3 (so that each process gets the same dataset length). Then process 0, 1 and
    2 will be responsible of making predictions for the following samples:

    - P0: `[0, 1, 2, 3, 4, 5]`
    - P1: `[6, 7, 8, 9, 10, 11]`
    - P2: `[12, 13, 14, 15, 0, 1]`

    The first batch treated on each process will be:

    - P0: `[0, 1]`
    - P1: `[6, 7]`
    - P2: `[12, 13]`

    So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) corresponding to
    the following indices:

        `[0, 1, 6, 7, 12, 13]`

    If we directly concatenate our results without taking any precautions, the user will then get the predictions for
    the indices in this order at the end of the prediction loop:

        `[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`

    For some reason, that's not going to roll their boat. This class is there to solve that problem.

    Args:
        world_size (`int`):
            The number of processes used in the distributed training.
        num_samples (`int`):
            The number of samples in our dataset.
        make_multiple_of (`int`, *optional*):
            If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument
            (by adding samples).
        padding_index (`int`, *optional*, defaults to -100):
            The padding index to use if the arrays don't all have the same sequence length.
    """

    def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
        warnings.warn(
            "DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.world_size = world_size
        self.num_samples = num_samples
        total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
        self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
        self.process_length = self.total_samples // world_size
        self._storage = None
        self._offsets = None
        self.padding_index = padding_index

    def add_arrays(self, arrays):
        """
        Add `arrays` to the internal storage, Will initialize the storage to the full size at the first arrays passed
        so that if we're bound to get an OOM, it happens at the beginning.
        """
        if arrays is None:
            return
        if self._storage is None:
            self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
            self._offsets = list(range(0, self.total_samples, self.process_length))

        slice_len, self._storage = self._nested_set_tensors(self._storage, arrays)
        for i in range(self.world_size):
            self._offsets[i] += slice_len

    def _nested_set_tensors(self, storage, arrays):
        if isinstance(arrays, (list, tuple)):
            result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)]
            return result[0][0], type(arrays)(r[1] for r in result)
        assert (
            arrays.shape[0] % self.world_size == 0
        ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."

        slice_len = arrays.shape[0] // self.world_size
        for i in range(self.world_size):
            if len(arrays.shape) == 1:
                storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
            else:
                # Expand the storage on the second dimension if needed.
                if len(storage.shape) > 1 and storage.shape[1] < arrays.shape[1]:
                    storage = expand_like(storage, arrays.shape[1], padding_index=self.padding_index)
                storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
                    i * slice_len : (i + 1) * slice_len
                ]
        return slice_len, storage

    def finalize(self):
        """
        Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras
        to get each process a dataset of the same length).
        """
        if self._storage is None:
            return
        if self._offsets[0] != self.process_length:
            logger.warning("Not all data has been set. Are you sure you passed all values?")
        return nested_truncate(self._storage, self.num_samples)


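# Illustrative usage sketch of the gatherer described above (hypothetical `world_size`, `eval_dataset`
# and per-step `all_step_predictions`): arrays are added chunk by chunk, padded to a common sequence
# length, and reordered/truncated on `finalize`.
#
#     gatherer = DistributedTensorGatherer(world_size, len(eval_dataset))
#     for step_predictions in all_step_predictions:
#         gatherer.add_arrays(nested_numpify(step_predictions))
#     final_predictions = gatherer.finalize()

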
@dataclass
class LabelSmoother:
    """
    Adds label-smoothing on a pre-computed output from a Transformers model.

    Args:
        epsilon (`float`, *optional*, defaults to 0.1):
            The label smoothing factor.
        ignore_index (`int`, *optional*, defaults to -100):
            The index in the labels to ignore when computing the loss.
    """

    epsilon: float = 0.1
    ignore_index: int = -100

    def __call__(self, model_output, labels, shift_labels=False):
        logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
        if shift_labels:
            logits = logits[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()

        log_probs = -nn.functional.log_softmax(logits, dim=-1)
        if labels.dim() == log_probs.dim() - 1:
            labels = labels.unsqueeze(-1)

        padding_mask = labels.eq(self.ignore_index)
        # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
        # will ignore them in any case.
        labels = torch.clamp(labels, min=0)
        nll_loss = log_probs.gather(dim=-1, index=labels)
        # works for fp16 input tensor too, by internally upcasting it to fp32
        smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

        nll_loss.masked_fill_(padding_mask, 0.0)
        smoothed_loss.masked_fill_(padding_mask, 0.0)

        # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
        num_active_elements = padding_mask.numel() - padding_mask.long().sum()
        nll_loss = nll_loss.sum() / num_active_elements
        smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
        return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss


def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None):
    """
    Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar
    lengths. To do this, the indices are:

    - randomly permuted
    - grouped in mega-batches of size `mega_batch_mult * batch_size`
    - sorted by length in each mega-batch

    The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
    maximum length placed first, so that an OOM happens sooner rather than later.
    """
    # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
    if mega_batch_mult is None:
        mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
        # Just in case, for tiny datasets
        if mega_batch_mult == 0:
            mega_batch_mult = 1

    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = mega_batch_mult * batch_size
    megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]

    # The rest is to get the biggest batch first. Since each megabatch is sorted by descending length, the longest
    # element is the first one.
    megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
    max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
    # Switch to put the longest element in first position
    megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0]

    return [i for megabatch in megabatches for i in megabatch]


class LengthGroupedSampler(Sampler):
    r"""
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        dataset: Optional[Dataset] = None,
        lengths: Optional[list[int]] = None,
        model_input_name: Optional[str] = None,
        generator=None,
    ):
        if dataset is None and lengths is None:
            raise ValueError("One of dataset and lengths must be provided.")

        self.batch_size = batch_size
        if lengths is None:
            model_input_name = model_input_name if model_input_name is not None else "input_ids"
            if (
                not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
                or model_input_name not in dataset[0]
            ):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{model_input_name}' key."
                )
            lengths = [len(feature[model_input_name]) for feature in dataset]
        elif isinstance(lengths, torch.Tensor):
            logger.info(
                "If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to list[int]..."
            )
            lengths = lengths.tolist()

        self.lengths = lengths
        self.generator = generator

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator)
        return iter(indices)


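# Illustrative usage sketch (hypothetical `train_dataset` with an `input_ids` column and `collate_fn`):
# grouping by length keeps the amount of padding per batch small while retaining some shuffling.
#
#     sampler = LengthGroupedSampler(batch_size=16, dataset=train_dataset)
#     loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, sampler=sampler, collate_fn=collate_fn)

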
class DistributedLengthGroupedSampler(DistributedSampler):
    r"""
    Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
    length while keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        dataset: Optional[Dataset] = None,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
        drop_last: bool = False,
        lengths: Optional[list[int]] = None,
        model_input_name: Optional[str] = None,
    ):
        if dataset is None and lengths is None:
            raise ValueError("One of dataset and lengths must be provided.")
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()

        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.drop_last = drop_last

        if lengths is None:
            model_input_name = model_input_name if model_input_name is not None else "input_ids"
            if (
                not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
                or model_input_name not in dataset[0]
            ):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{model_input_name}' key."
                )
            lengths = [len(feature[model_input_name]) for feature in dataset]
        elif isinstance(lengths, torch.Tensor):
            logger.info(
                "If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to"
                " list[int]..."
            )
            lengths = lengths.tolist()

        self.lengths = lengths

        # If the dataset length is evenly divisible by the number of replicas, there is no need to drop any data,
        # since the dataset will be split equally.
        if self.drop_last and len(self.lengths) % self.num_replicas != 0:
            # Split to the nearest available length that is evenly divisible, to ensure each rank receives the same
            # amount of data when using this Sampler.
            self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
        else:
            self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
        self.total_size = self.num_samples * self.num_replicas
        self.seed = seed

    def __iter__(self) -> Iterator:
        # Deterministically shuffle based on epoch and seed
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)

        if not self.drop_last:
            # add extra samples to make it evenly divisible
            indices += indices[: (self.total_size - len(indices))]
        else:
            # remove tail of data to make it evenly divisible.
            indices = indices[: self.total_size]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)


class ShardSampler(Sampler):
    """
    Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch
    size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which shard into
    `[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1.

    The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on GPU-1.
    """

    def __init__(
        self,
        dataset: Dataset,
        batch_size: int = 1,
        drop_last: bool = False,
        num_processes: int = 1,
        process_index: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_processes = num_processes
        self.process_index = process_index

        self.total_batch_size = total_batch_size = batch_size * num_processes

        num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size)
        self.total_num_samples = num_batches * total_batch_size

    def __iter__(self):
        indices = list(range(len(self.dataset)))

        # Add extra samples to make it evenly divisible. While loop is there in the edge case we have a tiny dataset
        # and it needs to be done several times.
        while len(indices) < self.total_num_samples:
            indices += indices[: (self.total_num_samples - len(indices))]

        result = []
        for batch_start in range(self.batch_size * self.process_index, self.total_num_samples, self.total_batch_size):
            result += indices[batch_start : batch_start + self.batch_size]

        return iter(result)

    def __len__(self):
        # Each shard only sees a fraction of total_num_samples.
        return self.total_num_samples // self.num_processes


class IterableDatasetShard(IterableDataset):
    """
    Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
    always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x
    num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the
    first batch that would be too small or loop with indices from the beginning.

    <Tip warning={true}>

        If your IterableDataset implements some randomization that needs to be applied the same way on all processes
        (for instance, a shuffling), you should use a `torch.Generator` in a `generator` attribute of the `dataset` to
        generate your random numbers and call the [`~trainer_pt_utils.IterableDatasetShard.set_epoch`] method of this
        object. It will set the seed of this `generator` to `seed + epoch` on all processes before starting the
        iteration. Alternatively, you can also implement a `set_epoch()` method in your iterable dataset to deal with
        this.

    </Tip>

    Args:
        dataset (`torch.utils.data.IterableDataset`):
            The batch sampler to split in several shards.
        batch_size (`int`, *optional*, defaults to 1):
            The size of the batches per shard.
        drop_last (`bool`, *optional*, defaults to `False`):
            Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
            beginning.
        num_processes (`int`, *optional*, defaults to 1):
            The number of processes running concurrently.
        process_index (`int`, *optional*, defaults to 0):
            The index of the current process.
        seed (`int`, *optional*, defaults to 0):
            A random seed that will be used for the random number generation in
            [`~trainer_pt_utils.IterableDatasetShard.set_epoch`].
    """

    def __init__(
        self,
        dataset: IterableDataset,
        batch_size: int = 1,
        drop_last: bool = False,
        num_processes: int = 1,
        process_index: int = 0,
        seed: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_processes = num_processes
        self.process_index = process_index
        self.seed = seed
        self.epoch = 0
        self.num_examples = 0

    def set_epoch(self, epoch):
        self.epoch = epoch
        if hasattr(self.dataset, "set_epoch"):
            self.dataset.set_epoch(epoch)

    def __iter__(self):
        self.num_examples = 0
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.seed + self.epoch)
        real_batch_size = self.batch_size * self.num_processes
        process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size)

        first_batch = None
        current_batch = []
        for element in self.dataset:
            self.num_examples += 1
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == real_batch_size:
                for i in process_slice:
                    yield current_batch[i]
                if first_batch is None:
                    first_batch = current_batch.copy()
                current_batch = []

        # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning.
        if not self.drop_last and len(current_batch) > 0:
            if first_batch is None:
                first_batch = current_batch.copy()
            while len(current_batch) < real_batch_size:
                current_batch += first_batch
            for i in process_slice:
                yield current_batch[i]

    def __len__(self):
        # Will raise an error if the underlying dataset is not sized.
        if self.drop_last:
            return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
        else:
            return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size


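# Illustrative usage sketch (hypothetical `streaming_dataset`, `rank` and `epoch`): each process wraps
# the same iterable dataset and keeps only its slice of every `batch_size * num_processes` block.
#
#     shard = IterableDatasetShard(streaming_dataset, batch_size=8, num_processes=2, process_index=rank)
#     shard.set_epoch(epoch)  # re-seeds the dataset generator with `seed + epoch` on every process
#     loader = torch.utils.data.DataLoader(shard, batch_size=8)

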
def _secs2timedelta(secs):
    """
    Convert seconds to hh:mm:ss.msec, msecs rounded to 2 decimal places.
    """
    msec = int(abs(secs - int(secs)) * 100)
    return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}"


def metrics_format(self, metrics: dict[str, float]) -> dict[str, float]:
    """
    Reformat Trainer metrics values to a human-readable format.

    Args:
        metrics (`dict[str, float]`):
            The metrics returned from train/evaluate/predict

    Returns:
        metrics (`dict[str, float]`): The reformatted metrics
    """
    metrics_copy = metrics.copy()
    for k, v in metrics_copy.items():
        if "_mem_" in k:
            metrics_copy[k] = f"{v >> 20}MB"
        elif "_runtime" in k:
            metrics_copy[k] = _secs2timedelta(v)
        elif k == "total_flos":
            metrics_copy[k] = f"{int(v) >> 30}GF"
        elif isinstance(metrics_copy[k], float):
            metrics_copy[k] = round(v, 4)

    return metrics_copy


def log_metrics(self, split, metrics):
    """
    Log metrics in a specially formatted way.

    Under distributed environment this is done only for a process with rank 0.

    Args:
        split (`str`):
            Mode/split name: one of `train`, `eval`, `test`
        metrics (`dict[str, float]`):
            The metrics returned from train/evaluate/predict
    """
    if not self.is_world_process_zero():
        return

    print(f"***** {split} metrics *****")
    metrics_formatted = self.metrics_format(metrics)
    k_width = max(len(str(x)) for x in metrics_formatted.keys())
    v_width = max(len(str(x)) for x in metrics_formatted.values())
    for key in sorted(metrics_formatted.keys()):
        print(f"  {key: <{k_width}} = {metrics_formatted[key]: >{v_width}}")


def save_metrics(self, split, metrics, combined=True):
    """
    Save metrics into a json file for that split, e.g. `train_results.json`.

    Under distributed environment this is done only for a process with rank 0.

    Args:
        split (`str`):
            Mode/split name: one of `train`, `eval`, `test`, `all`
        metrics (`dict[str, float]`):
            The metrics returned from train/evaluate/predict
        combined (`bool`, *optional*, defaults to `True`):
            Creates combined metrics by updating `all_results.json` with metrics of this call

    To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw
    unformatted numbers are saved in the current method.
    """
    if not self.is_world_process_zero():
        return

    path = os.path.join(self.args.output_dir, f"{split}_results.json")
    with open(path, "w") as f:
        json.dump(metrics, f, indent=4, sort_keys=True)

    if combined:
        path = os.path.join(self.args.output_dir, "all_results.json")
        if os.path.exists(path):
            with open(path) as f:
                all_metrics = json.load(f)
        else:
            all_metrics = {}

        all_metrics.update(metrics)
        with open(path, "w") as f:
            json.dump(all_metrics, f, indent=4, sort_keys=True)


def save_state(self):
    """
    Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model.

    Under distributed environment this is done only for a process with rank 0.
    """
    if not self.is_world_process_zero():
        return

    path = os.path.join(self.args.output_dir, "trainer_state.json")
    self.state.save_to_json(path)


def get_model_param_count(model, trainable_only=False):
    """
    Calculate model's total param count. If trainable_only is True then count only those requiring grads.
    """
    if is_deepspeed_zero3_enabled():

        def numel(p):
            return p.ds_numel if hasattr(p, "ds_numel") else p.numel()

    else:

        def numel(p):
            return p.numel()

    return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)


def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None):
    """
    Returns the names of the model parameters that are not inside a forbidden layer.
    """
    forbidden_layer_patterns = (
        [re.compile(pattern) for pattern in forbidden_layer_names] if forbidden_layer_names is not None else []
    )
    result = []
    for name, child in model.named_children():
        child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names)
        result += [
            f"{name}.{n}"
            for n in child_params
            if not isinstance(child, tuple(forbidden_layer_types))
            and not any(pattern.search(f"{name}.{n}".lower()) for pattern in forbidden_layer_patterns)
        ]
    # Add model specific parameters that are not in any child
    result += [
        k
        for k in model._parameters.keys()
        if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
    ]
    return result


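# Illustrative usage sketch (hypothetical `model` and `weight_decay` value): the usual reason to call
# `get_parameter_names` is to keep LayerNorm weights and biases out of weight decay when building
# optimizer parameter groups.
#
#     decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias"])
#     optimizer_grouped_parameters = [
#         {"params": [p for n, p in model.named_parameters() if n in decay_parameters], "weight_decay": weight_decay},
#         {"params": [p for n, p in model.named_parameters() if n not in decay_parameters], "weight_decay": 0.0},
#     ]

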
def get_module_class_from_name(module, name):
    """
    Gets a class from a module by its name.

    Args:
        module (`torch.nn.Module`): The module to get the class from.
        name (`str`): The name of the class.
    """
    modules_children = list(module.children())
    if module.__class__.__name__ == name:
        return module.__class__
    elif len(modules_children) == 0:
        return
    else:
        for child_module in modules_children:
            module_class = get_module_class_from_name(child_module, name)
            if module_class is not None:
                return module_class


def remove_dummy_checkpoint(is_main_process, output_dir, filenames):
    if is_main_process:
        for filename in filenames:
            file = os.path.join(output_dir, filename)
            if os.path.isfile(file):
                os.remove(file)


if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp

    @smp.step()
    def smp_forward_backward(model, inputs, gradient_accumulation_steps=1):
        outputs = model(**inputs)
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        loss /= gradient_accumulation_steps
        model.backward(loss)
        return loss

    @smp.step()
    def smp_forward_only(model, inputs):
        return model(**inputs)

    def smp_gather(tensor):
        if isinstance(tensor, (list, tuple)):
            return type(tensor)(smp_gather(t) for t in tensor)
        elif isinstance(tensor, dict):
            return type(tensor)({k: smp_gather(v) for k, v in tensor.items()})
        elif not isinstance(tensor, torch.Tensor):
            raise TypeError(
                f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors."
            )
        all_tensors = smp.allgather(tensor, smp.CommGroup.DP_GROUP)
        all_tensors = [atleast_1d(t) for t in all_tensors]
        return torch.cat([t.cpu() for t in all_tensors], dim=0)

    def smp_nested_concat(tensor):
        if isinstance(tensor, (list, tuple)):
            return type(tensor)(smp_nested_concat(t) for t in tensor)
        elif isinstance(tensor, dict):
            return type(tensor)({k: smp_nested_concat(v) for k, v in tensor.items()})
        # It doesn't seem possible to check here if `tensor` is a StepOutput because StepOutput lives in `smp.step`
        # which is also the name of the decorator so Python is confused.
        return tensor.concat().detach().cpu()


@dataclass
class AcceleratorConfig:
    """
    A subset of arguments relating to the underlying [`accelerate.Accelerator`] implementation utilized in the
    `Trainer` that can be customized. Mostly relating to data.

    Parameters:
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether or not the accelerator should split the batches yielded by the dataloaders across the devices.
        dispatch_batches (`bool`, *optional*):
            If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
            and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader`
            whose underlying dataset is an `IterableDataset`, `False` otherwise.
        even_batches (`bool`, *optional*, defaults to `True`):
            If set to `True`, in cases where the total batch size across all processes does not exactly divide the
            dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
            all workers.
        use_seedable_sampler (`bool`, *optional*, defaults to `True`):
            Whether or not use a fully seedable random sampler (`accelerate.data_loader.SeedableRandomSampler`).
        non_blocking (`bool`, *optional*, defaults to `False`):
            Whether to use non-blocking CUDA calls to help minimize synchronization during distributed training with
            prepared `DataLoader` inputs being moved to device.
        gradient_accumulation_kwargs (`dict`, *optional*):
            Additional kwargs to configure gradient accumulation, see `accelerate.utils.GradientAccumulationPlugin`.
        use_configured_state (`bool`, *optional*, defaults to `False`):
            Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling
            `TrainingArguments`.
    """

    split_batches: bool = field(default=False)
    dispatch_batches: Optional[bool] = field(default=None)
    even_batches: bool = field(default=True)
    use_seedable_sampler: bool = field(default=True)
    non_blocking: bool = field(default=False)
    gradient_accumulation_kwargs: Optional[dict] = field(default=None)
    use_configured_state: bool = field(default=False)

    @classmethod
    def from_json_file(cls, json_file):
        # Check if exists
        open_file = io.open if os.path.exists(json_file) else open
        with open_file(json_file, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        # Check for keys and load sensible defaults
        extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys())
        if len(extra_keys) > 0:
            raise ValueError(
                f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your"
                " `transformers` version or fix (and potentially remove these keys) from your config file."
            )
        return cls(**config_dict)

    def to_dict(self):
        return copy.deepcopy(self.__dict__)

    def pop(self, key, default=None):
        return self.__dict__.pop(key, default)


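# Illustrative usage sketch (hypothetical `accelerator_config.json` path): the dataclass can be
# populated from a JSON file; unknown keys raise an explicit error instead of being silently dropped.
#
#     accelerator_config = AcceleratorConfig.from_json_file("accelerator_config.json")
#     kwargs = accelerator_config.to_dict()
#     non_blocking = kwargs.pop("non_blocking")

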
class LayerWiseDummyOptimizer(torch.optim.Optimizer):
    """
    For Layer-wise optimizers such as GaLoRE optimizer, the optimization step is already done through the post
    gradient hooks. Therefore the trick is to create a dummy optimizer that can take arbitrary args and kwargs and
    return a no-op during training.

    Initial idea from @hiyouga in LLaMA-Factory:
    https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba
    """

    def __init__(self, optimizer_dict=None, *args, **kwargs):
        dummy_tensor = torch.randn(1, 1)
        self.optimizer_dict = optimizer_dict
        super().__init__([dummy_tensor], {"lr": kwargs.get("lr", 1e-03)})

    def zero_grad(self, set_to_none: bool = True) -> None:
        pass

    def step(self, closure=None) -> Optional[float]:
        pass


class LayerWiseDummyScheduler(LRScheduler):
    """
    For Layer-wise optimizers such as GaLoRE optimizer, the optimization and scheduling step are already done through
    the post gradient hooks. Therefore the trick is to create a dummy scheduler that can take arbitrary args and
    kwargs and return a no-op during training.
    """

    def __init__(self, *args, **kwargs):
        self.default_lr = kwargs["lr"]
        optimizer = LayerWiseDummyOptimizer(**kwargs)
        last_epoch = -1
        verbose = False
        super().__init__(optimizer, last_epoch, verbose)

    def get_lr(self):
        # default value
        lrs = [self.default_lr]

        # we take each lr in the parameters if they exist, assumes the optimizer to be the `LayerWiseDummyOptimizer`
        if self.optimizer is not None:
            param_wise_lrs = [
                [group["lr"] for group in optim.param_groups] for optim in self.optimizer.optimizer_dict.values()
            ]
            lrs = list(chain(*param_wise_lrs))

        return lrs

    def _get_closed_form_lr(self):
        return self.base_lrs


def set_rng_state_for_device(device_name, device_module, checkpoint_rng_state, is_distributed):
    """Helper to set RNG state for a specific device type (CUDA, NPU, MLU, MUSA)"""
    device_state_key = device_name.lower()
    err_template = (
        "Didn't manage to set back the RNG states of the {backend} because of the following error:\n {exception}"
        "\nThis won't yield the same results as if the training had not been interrupted."
    )
    try:
        if is_distributed:
            device_module.random.set_rng_state_all(checkpoint_rng_state[device_state_key])
        else:
            device_module.random.set_rng_state(checkpoint_rng_state[device_state_key])
    except Exception as e:
        logger.error(err_template.format(backend=device_name, exception=e))