from typing import cast, Optional, Union

import torch
from torch import Tensor

from .optimizer import (
    _capturable_doc,
    _default_to_fused_or_foreach,
    _device_dtype_check_for_fused,
    _differentiable_doc,
    _disable_dynamo_if_unsupported,
    _foreach_doc,
    _fused_doc,
    _get_capturable_supported_devices,
    _get_scalar_dtype,
    _get_value,
    _maximize_doc,
    _params_doc,
    _stack_if_compiling,
    _to_scalar,
    _use_grad_for_differentiable,
    _view_as_real,
    DeviceDict,
    DeviceDtypeDict,
    Optimizer,
    ParamsT,
)

__all__ = ["Adam", "adam"]


class Adam(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: Union[float, Tensor] = 1e-3,
        betas: tuple[Union[float, Tensor], Union[float, Tensor]] = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0,
        amsgrad: bool = False,
        *,
        foreach: Optional[bool] = None,
        maximize: bool = False,
        capturable: bool = False,
        differentiable: bool = False,
        fused: Optional[bool] = None,
        decoupled_weight_decay: bool = False,
    ):
        # A tensor lr is only allowed when it is a single element and, for the
        # foreach implementation, when capturable=True.
        if isinstance(lr, Tensor):
            if foreach and not capturable:
                raise ValueError(
                    "lr as a Tensor is not supported for capturable=False and foreach=True"
                )
            if lr.numel() != 1:
                raise ValueError("Tensor lr must be 1-element")
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if isinstance(betas[0], Tensor) != isinstance(betas[1], Tensor):
            raise ValueError("betas must be either both floats or both Tensors")
        if isinstance(betas[0], Tensor):
            if not capturable and foreach:
                raise ValueError(
                    "betas[0] as a Tensor is not supported for capturable=False and foreach=True"
                )
            if betas[0].numel() != 1:
                raise ValueError("Tensor betas[0] must be 1-element")
        if isinstance(betas[1], Tensor):
            if not capturable and foreach:
                raise ValueError(
                    "betas[1] as a Tensor is not supported for capturable=False and foreach=True"
                )
            if betas[1].numel() != 1:
                raise ValueError("Tensor betas[1] must be 1-element")

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
            maximize=maximize,
            foreach=foreach,
            capturable=capturable,
            differentiable=differentiable,
            fused=fused,
            decoupled_weight_decay=decoupled_weight_decay,
        )
        super().__init__(params, defaults)

        if fused:
            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            self._step_supports_amp_scaling = True
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            # Checkpoints produced by older versions may be missing newer options.
            group.setdefault("amsgrad", False)
            group.setdefault("maximize", False)
            group.setdefault("foreach", None)
            group.setdefault("capturable", False)
            group.setdefault("differentiable", False)
            group.setdefault("decoupled_weight_decay", False)
            fused = group.setdefault("fused", None)
            for p in group["params"]:
                p_state = self.state.get(p, [])
                if len(p_state) != 0 and not torch.is_tensor(p_state["step"]):
                    step_val = float(p_state["step"])
                    p_state["step"] = (
                        torch.tensor(
                            step_val,
                            dtype=_get_scalar_dtype(is_fused=fused),
                            device=p.device,
                        )
                        if group["capturable"] or group["fused"]
                        else torch.tensor(step_val, dtype=_get_scalar_dtype())
                    )

    def _init_group(
        self,
        group,
        params_with_grad,
        grads,
        exp_avgs,
        exp_avg_sqs,
        max_exp_avg_sqs,
        state_steps,
    ):
        has_complex = False
        for p in group["params"]:
            if p.grad is None:
                continue
            has_complex |= torch.is_complex(p)
            params_with_grad.append(p)
            if p.grad.is_sparse:
                raise RuntimeError(
                    "Adam does not support sparse gradients, please consider SparseAdam instead"
                )
            grads.append(p.grad)

            state = self.state[p]
            # Lazy state initialization
            if len(state) == 0:
                if group["fused"]:
                    _device_dtype_check_for_fused(p)
                # `step` lives on the param's device only for capturable/fused;
                # otherwise it is hosted on CPU to avoid needless kernel launches.
                state["step"] = (
                    torch.zeros(
                        (),
                        dtype=_get_scalar_dtype(is_fused=group["fused"]),
                        device=p.device,
                    )
                    if group["capturable"] or group["fused"]
                    else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )
                if group["amsgrad"]:
                    # Maintains the max of all exp. moving avgs. of sq. grad. values
                    state["max_exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

            exp_avgs.append(state["exp_avg"])
            exp_avg_sqs.append(state["exp_avg_sq"])

            if group["amsgrad"]:
                max_exp_avg_sqs.append(state["max_exp_avg_sq"])
            if group["differentiable"] and state["step"].requires_grad:
                raise RuntimeError(
                    "`requires_grad` is not supported for `step` in differentiable mode"
                )
            # The foreach implementation without capturable does not support a tensor lr.
            if (
                group["foreach"]
                and torch.is_tensor(group["lr"])
                and not group["capturable"]
            ):
                raise RuntimeError(
                    "lr as a Tensor is not supported for capturable=False and foreach=True"
                )

            state_steps.append(state["step"])
        return has_complex
    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad: list[Tensor] = []
            grads: list[Tensor] = []
            exp_avgs: list[Tensor] = []
            exp_avg_sqs: list[Tensor] = []
            max_exp_avg_sqs: list[Tensor] = []
            state_steps: list[Tensor] = []
            beta1, beta2 = group["betas"]

            has_complex = self._init_group(
                group,
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                max_exp_avg_sqs,
                state_steps,
            )

            adam(
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                max_exp_avg_sqs,
                state_steps,
                amsgrad=group["amsgrad"],
                has_complex=has_complex,
                beta1=beta1,
                beta2=beta2,
                lr=group["lr"],
                weight_decay=group["weight_decay"],
                eps=group["eps"],
                maximize=group["maximize"],
                foreach=group["foreach"],
                capturable=group["capturable"],
                differentiable=group["differentiable"],
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
                decoupled_weight_decay=group["decoupled_weight_decay"],
            )

        return loss


Adam.__doc__ = (
    r"""Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}, \: \epsilon \text{ (epsilon)}                               \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ (first moment)},
                v_0 \leftarrow 0 \text{ (second moment)},\: v_0^{max}\leftarrow 0                \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm} v_t^{max} \leftarrow \mathrm{max}(v_{t-1}^{max},v_t)                  \\
            &\hspace{10mm}\widehat{v_t} \leftarrow v_t^{max}/\big(1-\beta_2^t \big)              \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                  \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
    """
    + rf"""
    Args:
        {_params_doc}
        lr (float, Tensor, optional): learning rate (default: 1e-3). A tensor LR
            is not yet supported for all our implementations. Please use a float
            LR if you are not also specifying fused=True or capturable=True.
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        decoupled_weight_decay (bool, optional): if True, this optimizer is
            equivalent to AdamW and the algorithm will not accumulate weight
            decay in the momentum nor variance. (default: False)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        {_foreach_doc}
        {_maximize_doc}
        {_capturable_doc}
        {_differentiable_doc}
        {_fused_doc}
    """
    + r"""
    .. Note::
        A prototype implementation of Adam and AdamW for MPS supports `torch.float32` and `torch.float16`.
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

    """
)
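

# Editorial illustration (not part of the optimizer implementation above): the
# update rule from the docstring, written out for a single dense parameter with
# plain tensor ops and without the AMSGrad / maximize / weight-decay branches.
# The function name and its arguments are hypothetical and are not used
# anywhere else in torch.
def _reference_adam_step(
    param: Tensor,
    grad: Tensor,
    exp_avg: Tensor,
    exp_avg_sq: Tensor,
    step: int,
    *,
    lr: float = 1e-3,
    beta1: float = 0.9,
    beta2: float = 0.999,
    eps: float = 1e-8,
) -> None:
    """One in-place Adam update mirroring the math block in ``Adam.__doc__``."""
    # m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    # v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    # Bias corrections undo the zero-initialization of the moment estimates.
    bias_correction1 = 1 - beta1**step
    bias_correction2 = 1 - beta2**step
    # theta_t = theta_{t-1} - lr * m_hat / (sqrt(v_hat) + eps)
    denom = (exp_avg_sq / bias_correction2).sqrt().add_(eps)
    param.addcdiv_(exp_avg, denom, value=-lr / bias_correction1)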


def _single_tensor_adam(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_avg_sqs: list[Tensor],
    max_exp_avg_sqs: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    amsgrad: bool,
    has_complex: bool,
    beta1: Union[float, Tensor],
    beta2: Union[float, Tensor],
    lr: Union[float, Tensor],
    weight_decay: float,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
    decoupled_weight_decay: bool,
):
    assert grad_scale is None and found_inf is None

    if torch.jit.is_scripting():
        # Under torchscript, lr and betas must be plain floats; the tensor
        # overloads of the ops below are not scriptable.
        assert isinstance(lr, float)
        assert isinstance(beta1, float)
        assert isinstance(beta2, float)

    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        # When compiling, the compiler handles the cudagraph check itself.
        if not torch.compiler.is_compiling() and capturable:
            capturable_supported_devices = _get_capturable_supported_devices()
            assert (
                param.device.type == step_t.device.type
                and param.device.type in capturable_supported_devices
            ), (
                "If capturable=True, params and state_steps must be on supported devices: "
                f"{capturable_supported_devices}."
            )

        # update step
        step_t += 1

        if weight_decay != 0:
            if decoupled_weight_decay:
                # Decoupled (AdamW-style) weight decay acts directly on the parameter.
                param.mul_(1 - lr * weight_decay)
            elif differentiable and isinstance(weight_decay, Tensor):
                if weight_decay.requires_grad:
                    grad = grad.addcmul_(param.clone(), weight_decay)
                else:
                    grad = grad.add(param, alpha=weight_decay)
            else:
                grad = grad.add(param, alpha=weight_decay)

        if torch.is_complex(param):
            grad = torch.view_as_real(grad)
            exp_avg = torch.view_as_real(exp_avg)
            exp_avg_sq = torch.view_as_real(exp_avg_sq)
            if amsgrad:
                max_exp_avg_sqs[i] = torch.view_as_real(max_exp_avg_sqs[i])
            param = torch.view_as_real(param)

        # Decay the first and second moment running average coefficient
        exp_avg.lerp_(grad, 1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)

        if capturable or differentiable:
            step = step_t

            bias_correction1 = 1 - beta1**step
            bias_correction2 = 1 - beta2**step

            step_size = lr / bias_correction1
            step_size_neg = step_size.neg()

            bias_correction2_sqrt = bias_correction2.sqrt()

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                if differentiable:
                    max_exp_avg_sq = max_exp_avg_sqs[i].clone()
                else:
                    max_exp_avg_sq = max_exp_avg_sqs[i]

                max_exp_avg_sqs[i].copy_(torch.maximum(max_exp_avg_sq, exp_avg_sq))

                # Fold the step size into denom to avoid extra param-sized temporaries.
                denom = (
                    max_exp_avg_sqs[i].sqrt() / (bias_correction2_sqrt * step_size_neg)
                ).add_(eps / step_size_neg)
            else:
                denom = (
                    exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)
                ).add_(eps / step_size_neg)

            param.addcdiv_(exp_avg, denom)
        else:
            step = _get_value(step_t)

            bias_correction1 = 1 - beta1**step
            bias_correction2 = 1 - beta2**step

            step_size = lr / bias_correction1

            bias_correction2_sqrt = bias_correction2**0.5

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])

                # Use the max. for normalizing the running avg. of the gradient
                denom = (max_exp_avg_sqs[i].sqrt() / bias_correction2_sqrt).add_(eps)
            else:
                denom = (exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)

            param.addcdiv_(exp_avg, denom, value=-step_size)

        # Switch the amsgrad state back to its complex view if needed.
        if amsgrad and torch.is_complex(params[i]):
            max_exp_avg_sqs[i] = torch.view_as_complex(max_exp_avg_sqs[i])
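

# The multi-tensor path below replaces the Python-level loop in
# _single_tensor_adam with torch._foreach_* ops, which apply one operation to a
# whole list of same-device tensors at a time. As a sketch (editorial, not code
# used by this module), the first-moment update m = beta1 * m + (1 - beta1) * g
# for every tensor in a group could be written as:
#
#     torch._foreach_mul_(exp_avgs, beta1)
#     torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)
#
# _multi_tensor_adam expresses the same update via torch._foreach_lerp_ after
# grouping tensors by device and dtype.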


def _multi_tensor_adam(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_avg_sqs: list[Tensor],
    max_exp_avg_sqs: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    amsgrad: bool,
    has_complex: bool,
    beta1: Union[float, Tensor],
    beta2: Union[float, Tensor],
    lr: Union[float, Tensor],
    weight_decay: float,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
    decoupled_weight_decay: bool,
):
    if len(params) == 0:
        return

    if isinstance(lr, Tensor):
        if not capturable:
            raise RuntimeError(
                "lr as a Tensor is not supported for capturable=False and foreach=True"
            )
        if lr.numel() != 1:
            raise RuntimeError("Tensor lr must be 1-element")

    if isinstance(beta1, Tensor):
        if not capturable:
            raise ValueError(
                "beta1 as a Tensor is not supported for capturable=False and foreach=True"
            )
        if beta1.numel() != 1:
            raise ValueError("Tensor beta1 must be 1-element")

    if isinstance(beta2, Tensor):
        if not capturable:
            raise ValueError(
                "beta2 as a Tensor is not supported for capturable=False and foreach=True"
            )
        if beta2.numel() != 1:
            raise ValueError("Tensor beta2 must be 1-element")

    # When compiling, the compiler handles the cudagraph check itself.
    if not torch.compiler.is_compiling() and capturable:
        capturable_supported_devices = _get_capturable_supported_devices(
            supports_xla=False
        )
        assert all(
            p.device.type == step.device.type
            and p.device.type in capturable_supported_devices
            for p, step in zip(params, state_steps)
        ), (
            "If capturable=True, params and state_steps must be on supported devices: "
            f"{capturable_supported_devices}."
        )

    assert grad_scale is None and found_inf is None

    assert not differentiable, "_foreach ops don't support autograd"

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]  # type: ignore[list-item]
    )
    for (
        device_params_,
        device_grads_,
        device_exp_avgs_,
        device_exp_avg_sqs_,
        device_max_exp_avg_sqs_,
        device_state_steps_,
    ), _ in grouped_tensors.values():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_exp_avgs = cast(list[Tensor], device_exp_avgs_)
        device_exp_avg_sqs = cast(list[Tensor], device_exp_avg_sqs_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        # View complex tensors as real so the foreach kernels can handle them.
        if has_complex:
            if amsgrad:
                device_max_exp_avg_sqs = cast(list[Tensor], device_max_exp_avg_sqs_)
                _view_as_real(
                    device_params,
                    device_grads,
                    device_exp_avgs,
                    device_exp_avg_sqs,
                    device_max_exp_avg_sqs,
                )
            else:
                _view_as_real(
                    device_params, device_grads, device_exp_avgs, device_exp_avg_sqs
                )

        if maximize:
            device_grads = torch._foreach_neg(device_grads)  # type: ignore[assignment]

        # Update steps. If the steps live on CPU, foreach falls back to a slow
        # per-tensor loop, so wrap the 1 into a tensor once up front.
        if not torch.compiler.is_compiling() and device_state_steps[0].is_cpu:
            torch._foreach_add_(
                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(device_state_steps, 1)

        if weight_decay != 0:
            if decoupled_weight_decay:
                # Decoupled (AdamW-style) weight decay acts directly on the parameters.
                torch._foreach_mul_(device_params, 1 - lr * weight_decay)
            else:
                # Reuse the intermediate already allocated for maximize.
                if maximize:
                    torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
                else:
                    device_grads = torch._foreach_add(  # type: ignore[assignment]
                        device_grads, device_params, alpha=weight_decay
                    )

        # Decay the first and second moment running average coefficient
        torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)

        torch._foreach_mul_(device_exp_avg_sqs, beta2)
        torch._foreach_addcmul_(
            device_exp_avg_sqs, device_grads, device_grads, 1 - beta2
        )

        # Free the intermediate to keep peak memory down.
        del device_grads

        if capturable:
            bias_correction1 = torch._foreach_pow(beta1, device_state_steps)
            bias_correction2 = torch._foreach_pow(beta2, device_state_steps)
            # foreach_sub does not allow a scalar as the first argument.
            torch._foreach_sub_(bias_correction1, 1)
            torch._foreach_sub_(bias_correction2, 1)
            # bias_correction1 stays negated since it is divided into a negation below.
            torch._foreach_neg_(bias_correction2)

            # step_size = -lr / (1 - beta1 ** t); bias_correction2_sqrt = sqrt(1 - beta2 ** t)
            torch._foreach_div_(bias_correction1, lr)
            torch._foreach_reciprocal_(bias_correction1)
            torch._foreach_sqrt_(bias_correction2)

            step_size = bias_correction1
            bias_correction2_sqrt = bias_correction2

            if amsgrad:
                device_max_exp_avg_sqs = cast(list[Tensor], device_max_exp_avg_sqs_)
                # Maintains the maximum of all 2nd moment running avg. till now
                torch._foreach_maximum_(device_max_exp_avg_sqs, device_exp_avg_sqs)
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_max_exp_avg_sqs)
            else:
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)

            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            torch._foreach_add_(exp_avg_sq_sqrt, eps)
            torch._foreach_div_(exp_avg_sq_sqrt, step_size)

            # exp_avg_sq_sqrt is now the negated, lr-scaled denominator, so a
            # plain addcdiv_ applies the full update.
            torch._foreach_addcdiv_(device_params, device_exp_avgs, exp_avg_sq_sqrt)
        else:
            bias_correction1 = [
                1 - beta1 ** _get_value(step) for step in device_state_steps
            ]
            bias_correction2 = [
                1 - beta2 ** _get_value(step) for step in device_state_steps
            ]

            step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1])

            bias_correction2_sqrt = [bc**0.5 for bc in bias_correction2]

            if amsgrad:
                device_max_exp_avg_sqs = cast(list[Tensor], device_max_exp_avg_sqs_)
                # Maintains the maximum of all 2nd moment running avg. till now
                torch._foreach_maximum_(device_max_exp_avg_sqs, device_exp_avg_sqs)
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_max_exp_avg_sqs)
            else:
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)

            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            torch._foreach_add_(exp_avg_sq_sqrt, eps)
            torch._foreach_addcdiv_(
                device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size
            )


def _fused_adam(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_avg_sqs: list[Tensor],
    max_exp_avg_sqs: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    amsgrad: bool,
    has_complex: bool,  # kept so all impls share one signature
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    weight_decay: float,
    eps: float,
    maximize: bool,
    capturable: bool,  # kept so all impls share one signature
    differentiable: bool,
    decoupled_weight_decay: bool,
) -> None:
    if not params:
        return
    if differentiable:
        raise RuntimeError("Adam with fused=True does not support differentiable=True")

    grad_scale_dict: DeviceDict = (
        {grad_scale.device: grad_scale} if grad_scale is not None else {}
    )
    found_inf_dict: DeviceDict = (
        {found_inf.device: found_inf} if found_inf is not None else {}
    )

    # Only move the lr between devices when it is a non-CPU tensor; otherwise
    # treat it as a scalar.
    lr_dict: Optional[DeviceDict] = (
        {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None
    )

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]  # type: ignore[list-item]
    )
    for (device, _), (
        (
            device_params_,
            device_grads_,
            device_exp_avgs_,
            device_exp_avg_sqs_,
            device_max_exp_avg_sqs,
            device_state_steps_,
        ),
        _,
    ) in grouped_tensors.items():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_exp_avgs = cast(list[Tensor], device_exp_avgs_)
        device_exp_avg_sqs = cast(list[Tensor], device_exp_avg_sqs_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        device_grad_scale, device_found_inf = None, None
        if grad_scale is not None:
            device_grad_scale = grad_scale_dict.setdefault(
                device, grad_scale.to(device, non_blocking=True)
            )
        if found_inf is not None:
            device_found_inf = found_inf_dict.setdefault(
                device, found_inf.to(device, non_blocking=True)
            )
        if lr_dict is not None and device not in lr_dict:
            lr_dict[device] = lr.to(device=device, non_blocking=True)  # type: ignore[union-attr]
            lr = lr_dict[device]

        torch._foreach_add_(device_state_steps, 1)
        func = torch._fused_adamw_ if decoupled_weight_decay else torch._fused_adam_
        func(
            device_params,
            device_grads,
            device_exp_avgs,
            device_exp_avg_sqs,
            device_max_exp_avg_sqs,  # type: ignore[arg-type]
            device_state_steps,
            amsgrad=amsgrad,
            lr=lr,
            beta1=beta1,
            beta2=beta2,
            weight_decay=weight_decay,
            eps=eps,
            maximize=maximize,
            grad_scale=device_grad_scale,
            found_inf=device_found_inf,
        )
        if device_found_inf is not None:
            torch._foreach_sub_(
                device_state_steps, [device_found_inf] * len(device_state_steps)
            )


@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adam)
def adam(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_avg_sqs: list[Tensor],
    max_exp_avg_sqs: list[Tensor],
    state_steps: list[Tensor],
    # kwonly args with defaults are not supported by torchscript-compiled
    # functions, so these stay positional with defaults.
    foreach: Optional[bool] = None,
    capturable: bool = False,
    differentiable: bool = False,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    has_complex: bool = False,
    decoupled_weight_decay: bool = False,
    *,
    amsgrad: bool,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    weight_decay: float,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs Adam algorithm computation.

    See :class:`~torch.optim.Adam` for details.
    """
    # Respect explicit user choices for foreach/fused; only pick a default when
    # neither was specified.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )
        # Do not flip on foreach for the unsupported case of a tensor lr with capturable=False.
        if foreach and isinstance(lr, Tensor) and not capturable:
            foreach = False
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    # This check is slow during compilation, so it is skipped there.
    if not torch.compiler.is_compiling() and not all(
        isinstance(t, torch.Tensor) for t in state_steps
    ):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    if fused and not torch.jit.is_scripting():
        func = _fused_adam
    elif foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adam
    else:
        func = _single_tensor_adam

    func(
        params,
        grads,
        exp_avgs,
        exp_avg_sqs,
        max_exp_avg_sqs,
        state_steps,
        amsgrad=amsgrad,
        has_complex=has_complex,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        maximize=maximize,
        capturable=capturable,
        differentiable=differentiable,
        grad_scale=grad_scale,
        found_inf=found_inf,
        decoupled_weight_decay=decoupled_weight_decay,
    )
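

# Example usage (editorial illustration, not part of this module). `data_loader`
# is a placeholder for any iterable of (inputs, targets) batches:
#
#     import torch
#     from torch.optim import Adam
#
#     model = torch.nn.Linear(10, 1)
#     optimizer = Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.0)
#
#     for inputs, targets in data_loader:
#         optimizer.zero_grad()
#         loss = torch.nn.functional.mse_loss(model(inputs), targets)
#         loss.backward()
#         optimizer.step()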