L ig&JdZddlmZmZmZddlZddlmZddlmZm Z m Z m Z m Z m Z mZmZmZmZmZmZmZmZmZmZddgZGd deZd d ed e d ed ed e d ze_deedeedeedeedeedeedededededededededededef"d Zdeedeedeedeedeedeedededededededededededef"d!Ze e" d%deedeedeedeedeedeeded#eededededededededededef$d$Zy)&z'Implementation for the NAdam algorithm.)castOptionalUnionN)Tensor)_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported _foreach_doc!_get_capturable_supported_devices_get_scalar_dtype _get_value _maximize_doc _params_doc_stack_if_compiling _to_scalar_use_grad_for_differentiable _view_as_real OptimizerParamsTNAdamnadamceZdZ dddddddedeeefdeeefdeded ed ed e ed ed edeffdZ fdZ dZ e ddZxZS)rFN)foreachmaximize capturabledifferentiableparamslrbetaseps weight_decaymomentum_decaydecoupled_weight_decayrrrrc t|tr|jdk7r tdd|kstd|d|kstd|d|dcxkrdksntd|dd|dcxkrdksntd |dd|kstd |d|kstd |||||||| || | d } t ||| y) NrzTensor lr must be 1-elementzInvalid learning rate: zInvalid epsilon value: r?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: ) r r!r"r#r$r%rrrr) isinstancernumel ValueErrorsuper__init__)selfrr r!r"r#r$r%rrrrdefaults __class__s W/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/torch/optim/nadam.pyr-zNAdam.__init__!s b& !bhhjAo:; ;by6rd;< <cz6se<= =eAh$$B58*MN NeAh$$B58*MN Nl";L>JK Kn$=n=MNO O(,&< $,   *c8t|||jD]z}|jdd|jdd|jdd|jdd|jdd|dD]}|jj |g}t |dk7s/tj|d s_t|d }|dr*tj|t|j ntj|t |d <tj|d r|d }|dr*tj|t|j ntj|t |d <}y) NrFrrrr%rrstepdtypedevicer6 mu_product) r, __setstate__ param_groups setdefaultstategetlentorch is_tensorfloattensorrr7)r.r=grouppp_statestep_val mu_prod_valr0s r1r:zNAdam.__setstate__Lsn U#&& E   Z /   Y -   \5 1   -u 5   5u =8_ **..B/w<1$ ??76?;#(#9 %\2"LL (0A0CAHH"'h>O>Q!R  !??7<+@A&-l&; %\2"LL +3D3Fqxx"'kARAT!U  -   r2cd}|dD]} | j|tj| z}|j| | jjr t d|j| j|j | } t| dk(r|dr*tjdt| jntjdt | d <|dr*tjdt| jntjd t | d <tj| tj | d<tj| tj | d<|j| d|j| d|j| d |j| d |S)NFrz'NAdam does not support sparse gradientsrrr5r'r8r4r(r9) memory_formatexp_avg exp_avg_sq)gradr@ is_complexappend is_sparse RuntimeErrorr=r?zerosrr7rCones zeros_likepreserve_format) r.rDparams_with_gradgradsexp_avgs exp_avg_sqs mu_products state_steps has_complexrEr=s r1 _init_groupzNAdam._init_groupjs x$ 2Avv!u//22  ''*66##&'PQQ QVV$ 1 u:? !. B.?.A!((S"\\#5F5HI&M!. 2->-@R"\\#5F5HI,' (-'7'7)>)>(E)$+0*:*:)>)>+E,'i 01""5#67""5#67""5=1I$ 2Jr2c|jd}|$tj5|}ddd|jD]}g}g}g}g}g}g} t t t t f|d\} } |j||||||| } t|||||| | | |d|d|d|d|d|d|d |d |d | |S#1swYxYw) zPerform a single optimization step. Args: closure (Callable, optional): A closure that reevaluates the model and returns the loss. Nr!r r#r$r"rr%rrr) beta1beta2r r#r$r"rr%rrrr]) _cuda_graph_capture_health_checkr@ enable_gradr;rtuplerBr^r) r.closurelossrDrWrXrYrZr[r\r`rar]s r1r4z NAdam.steps2 --/  ""$ !y !&&& E-/ "$E%'H(*K(*K(*KeUl 3U7^DLE5** K  ;">2$%56%Lz*',-E'Fi( .$%56'% '& P W ! !s C  C)gMb`?)g?g+?g:0yE>rgMbp?FN)__name__ __module__ __qualname__rrrBrrdboolrr-r:r^rr4 __classcell__)r0s@r1rr s$(%1 $',)+#' $)+)+ %- )+UE\" )+  )+  )+)+!%)+$)+)+)+)+V<0d"6"6r2a Implements NAdam algorithm. .. math:: \begin{aligned} &\rule{110mm}{0.4pt} \\ &\textbf{input} : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)}, \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)} \\ &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)} \\ &\hspace{13mm} \: \textit{decoupled\_weight\_decay}, \:\textit{maximize} \\ &\textbf{initialize} : m_0 \leftarrow 0 \text{ ( first moment)}, v_0 \leftarrow 0 \text{ ( second moment)} \\[-1.ex] &\rule{110mm}{0.4pt} \\ &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\ &\hspace{5mm}\textbf{if} \: \textit{maximize}: \\ &\hspace{10mm}g_t \leftarrow -\nabla_{\theta} f_t (\theta_{t-1}) \\ &\hspace{5mm}\textbf{else} \\ &\hspace{10mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\ &\hspace{5mm} \theta_t \leftarrow \theta_{t-1} \\ &\hspace{5mm} \textbf{if} \: \lambda \neq 0 \\ &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay} \\ &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1} \\ &\hspace{10mm}\textbf{else} \\ &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\ &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{t \psi} \big) \\ &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\ &\hspace{5mm}m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\ &\hspace{5mm}v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g^2_t \\ &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex] & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i}) \\ &\hspace{5mm}\widehat{v_t} \leftarrow v_t/\big(1-\beta_2^t \big) \\ &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/ \big(\sqrt{\widehat{v_t}} + \epsilon \big) \\ &\rule{110mm}{0.4pt} \\[-1.ex] &\bf{return} \: \theta_t \\[-1.ex] &\rule{110mm}{0.4pt} \\[-1.ex] \end{aligned} For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_. z Args: a lr (float, Tensor, optional): learning rate (default: 2e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) momentum_decay (float, optional): momentum momentum_decay (default: 4e-3) decoupled_weight_decay (bool, optional): whether to decouple the weight decay as in AdamW to obtain NAdamW. If True, the algorithm does not accumulate weight decay in the momentum nor variance. (default: False) z z .. _Incorporating Nesterov Momentum into Adam: https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 rrXrYrZr[r\r`rar r#r$r"r%rrrr]c tjjs t|}t |D]\}}| s||n|| }||}||}||}||}tj |rTtj |}tj |}tj |}tj |}tjjsy|rwt}|jj|jjcxk(r|jjk(rnn|jj|vs Jd|d|dz }|r|}n t|}d||zz }| dk7r-| r|jd|| zz n|j|| }|ddd|| zzzz z}|ddd|dz| zzzz z}||z}|j|d|z |j|j!||d|z |j#|j%}|s|r]|j| }||z}|| d|z zd|z z z}|| |zd|z z z}|j'|||j'||\t||z}|j)| |j'||| d|z zdt|z z |j'||t+t,| |zd|z z  y) NzVIf capturable=True, params, mu_products and state_steps must be on supported devices: .rralphar(?Q?)value)r@jit is_scriptingr enumeraterO view_as_realcompiler is_compilingr r7typermul_addlerp_addcmul_divsqrtaddcdiv_add_rrB)rrXrYrZr[r\r`rar r#r$r"r%rrrr]iparamrNrLrMr9step_tcapturable_supported_devicesr4bias_correction2mumu_nextdenommu_product_nexts r1_single_tensor_nadamrs)( 99 ! ! # ^f%I5'uQxeAhY1+ ^  ^ Q   E "&&u-E%%d+D((1G++J7J~~**,+L+N ( !!Z%6%6%;%;Qv}}?Q?QQLL%%)EE ))E(FaI  F !  Df%Dud{? 1 % 1rL001xx\x:cC4D>,A#BCC D3$(n1L(M!NNO b   dAI&''d!e)'D/0557 ZIIcNE)72OB3#(+sZ/?@AD"w#2G!HIG NN4 ' NN7E *(4w>O JJsO NNeRC38$4j>T8T$U   NN5B3=S?5J"KL  KIr2c  ,t|dk(ry|rJdtjjs8|r6t d,t ,fdt |||Ds Jd,dt|}tj||||||g}|jD]\\}}}}}}}ttt|}ttt|}ttt|}ttt|}ttt|}ttt|}|rt||||| rtj|}tjjs=|dj r.tj"|tj$dd d ntj"|d | dk7rR| rtj&|d || zz n3| rtj"||| ntj(||| }tj*||d |z tj&||tj,|||d |z tj.|}|r4tj0|| } tj2d | }!tj&|!dtj"|!dtj&|!|tj"| | tj2d | }"tj&|"dtj"|"dtj&|"|~ tj2||}#tj4|#dtj6|#tj8|#nr|D$cgc]}$d |t;|$zz dz}#}$|D$cgc]}$|ddd t;|$| zzzz z}!}$|D$cgc]}$|ddd t;|$d z| zzzz z!}"}$tj&||!tj<||#tj"|| ~#|rtj4|!dtj&|!|tj>|d}%tj6|%tj<|!|%|!}&~%tj0||"}%tj&|"|tj4|%dtj<|"|%|"}'~%tj0|&|}(tj,|(|'|tj@||(|tCt ||!D)*cgc](\})}*t;|d|*z zdt;|)z z dz*c}*})}&tCt ||"D)+cgc](\})}+t;||+zdt;|)|+zz z dz*c}+})}'tj@||||&tj@||||'ycc}$wcc}$wcc}$wcc}*})wcc}+})w)Nrz#_foreach ops don't support autogradF) supports_xlac3K|]n\}}}|jj|jjcxk(xr|jjk(ncxr|jjvpywrg)r7rz).0rEmpr4rs r1 z&_multi_tensor_nadam..s` 2t HHMMRYY^^ ?t{{/?/? ? > !== > sA4A7zWIf capturable=True, params, mu_products, and state_steps must be on supported devices: rnr(cpu)r7rorrrgrq)"r?r@rxryr allziprr"_group_tensors_by_device_and_dtypevaluesrlistrr _foreach_negis_cpu _foreach_add_rC _foreach_mul_ _foreach_add_foreach_lerp__foreach_addcmul_ _foreach_sqrt _foreach_mul _foreach_pow _foreach_sub_ _foreach_neg__foreach_sqrt_r _foreach_div_ _foreach_sub_foreach_addcdiv_r)-rrXrYrZr[r\r`rar r#r$r"r%rrrr]grouped_tensorsgrouped_params_grouped_grads_grouped_exp_avgs_grouped_exp_avg_sqs_grouped_mu_products_grouped_state_steps__grouped_params grouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_mu_productsgrouped_state_stepsexp_avg_sq_sqrtexponentmusmu_nextsbias_correction_sqrtr4rstep_size_gradsstep_size_expavg numeratorr9rrrs- @r1_multi_tensor_nadamr|sY( 6{aDDD  >> & & (Z'H( $  #6; D    R+,A /    BBBB +{KHO  " " $k  d6lO<T&\>: V .?@"4<1EF"4<1EF"4<1EF   /?AT  !..}=M ~~**,1DQ1G1N1N   #U\\#e%DC     3Q 7 1 %##NA\8I4IJ''%~\%*$6$6%~\%M -}a%iH /7   q5y  --.AB ))*=~NH$$T84C   T *   S )   U +   . 9))$9H   $ /   # .   % 0#(#5#5e=P#Q    4c :    4 5  !5 6DW$;?Uj...36$ $ 0sdz$/?./P&QRRSC0*T*:Q*>.)P QRRTH /5 O-AB OS1 !    S )   R (&&':C@E    &   U +!O&&':HEE   " -   s +   % 0' **?MJI  # #I/?AQ R  # #NI O1+..A3*G& B ^sRx0C*Z:P4PQUWWO 3033F/Q, G #2!"J!7'!AAC     # #    # #   Mkn$b sY)!Y$Y!-Y" *-Y( )single_tensor_fnrc td|Ds tdtd|Ds td|t|| d\}}|r)tjj r td|r%tjj st }nt}|||||||| | |||| |||| | y) zpFunctional API that performs NAdam algorithm computation. See :class:`~torch.optim.NAdam` for details. c3PK|]}t|tj ywrgr)r@rrts r1rznadam..s@qz!U\\*@$&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc3PK|]}t|tj ywrgrrs r1rznadam..xrrzPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF) use_fusedz6torch.jit.script not supported with foreach optimizers) r`rar r#r$rr%r"rrr])rrRr r@rtrurr)rrXrYrZr[r\r%rrrr]rr`rar r#r$r"rfuncs r1rrWs8 @K@ @ ^   @K@ @ ^  1 Ne 7599))+STTuyy--/"#  !%5 %#r2)FNFFFF) __doc__typingrrrr@r optimizerrr r r r r rrrrrrrrrr__all__rrrBrkrrrrJr2r1rsW.(( ( G sIsn&N       !O> F` L` <`6l`f ` f ` f ` ` ` ``` `!`` !`"#`$%`FX LX <X6lXf X f X f X X X XXX X!XX !X"#X$%Xv 1EF$)" D LD <D6lDf D f D f D!Dd^DDDDD !D" #D$ %D&'D()D* +DGDr2