from typing import cast, Optional, Union

import torch
from torch import Tensor

from .optimizer import (
    _capturable_doc,
    _default_to_fused_or_foreach,
    _differentiable_doc,
    _disable_dynamo_if_unsupported,
    _foreach_doc,
    _get_capturable_supported_devices,
    _get_scalar_dtype,
    _get_value,
    _maximize_doc,
    _params_doc,
    _to_scalar,
    _use_grad_for_differentiable,
    _view_as_real,
    Optimizer,
    ParamsT,
)

__all__ = ["Adamax", "adamax"]


class Adamax(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: Union[float, Tensor] = 2e-3,
        betas: tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0,
        foreach: Optional[bool] = None,
        *,
        maximize: bool = False,
        differentiable: bool = False,
        capturable: bool = False,
    ):
        if isinstance(lr, Tensor) and lr.numel() != 1:
            raise ValueError("Tensor lr must be 1-element")
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            foreach=foreach,
            maximize=maximize,
            differentiable=differentiable,
            capturable=capturable,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)
            group.setdefault("capturable", False)
            for p in group["params"]:
                p_state = self.state.get(p, [])
                if len(p_state) != 0 and not torch.is_tensor(p_state["step"]):
                    step_val = float(p_state["step"])
                    p_state["step"] = (
                        torch.tensor(
                            step_val, dtype=_get_scalar_dtype(), device=p.device
                        )
                        if group["capturable"]
                        else torch.tensor(step_val, dtype=_get_scalar_dtype())
                    )

    def _init_group(
        self, group, params_with_grad, grads, exp_avgs, exp_infs, state_steps
    ):
        has_complex = False
        for p in group["params"]:
            if p.grad is None:
                continue
            has_complex |= torch.is_complex(p)
            params_with_grad.append(p)
            if p.grad.is_sparse:
                raise RuntimeError("Adamax does not support sparse gradients")
            grads.append(p.grad)

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state["step"] = (
                    torch.zeros((), dtype=_get_scalar_dtype(), device=p.device)
                    if group["capturable"]
                    else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                state["exp_avg"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )
                state["exp_inf"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )

            exp_avgs.append(state["exp_avg"])
            exp_infs.append(state["exp_inf"])
            state_steps.append(state["step"])

        return has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad: list[Tensor] = []
            grads: list[Tensor] = []
            exp_avgs: list[Tensor] = []
            exp_infs: list[Tensor] = []
            state_steps: list[Tensor] = []

            beta1, beta2 = group["betas"]
            eps = group["eps"]
            lr = group["lr"]
            weight_decay = group["weight_decay"]
            foreach = group["foreach"]
            maximize = group["maximize"]
            differentiable = group["differentiable"]
            capturable = group["capturable"]

            has_complex = self._init_group(
                group, params_with_grad, grads, exp_avgs, exp_infs, state_steps
            )

            adamax(
                params_with_grad,
                grads,
                exp_avgs,
                exp_infs,
                state_steps,
                eps=eps,
                beta1=beta1,
                beta2=beta2,
                lr=lr,
                weight_decay=weight_decay,
                foreach=foreach,
                maximize=maximize,
                differentiable=differentiable,
                capturable=capturable,
                has_complex=has_complex,
            )

        return loss


Adamax.__doc__ = (
    r"""Implements Adamax algorithm (a variant of Adam based on infinity norm).

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)},
                \: \lambda \text{ (weight decay)},                                               \\
            &\hspace{13mm}    \epsilon \text{ (epsilon)}                                         \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                u_0 \leftarrow 0 \text{ ( infinity norm)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t      \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t               \\
            &\hspace{5mm}u_t      \leftarrow   \mathrm{max}(\beta_2 u_{t-1}, |g_{t}|+\epsilon)   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \frac{\gamma m_t}{(1-\beta^t_1) u_t} \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
    """
    + rf"""
    Args:
        {_params_doc}
        lr (float, Tensor, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        {_foreach_doc}
        {_maximize_doc}
        {_differentiable_doc}
        {_capturable_doc}

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980

    """
)


def _single_tensor_adamax(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_infs: list[Tensor],
    state_steps: list[Tensor],
    *,
    eps: float,
    beta1: float,
    beta2: float,
    lr: float,
    weight_decay: float,
    maximize: bool,
    differentiable: bool,
    capturable: bool,
    has_complex: bool,
):
    if not torch.jit.is_scripting():
        lr = _to_scalar(lr)

    for i, param in enumerate(params):
        grad = grads[i]
        grad = grad if not maximize else -grad
        exp_avg = exp_avgs[i]
        exp_inf = exp_infs[i]
        step_t = state_steps[i]

        # If compiling, the compiler handles cudagraph checks
        if not torch.compiler.is_compiling() and capturable:
            capturable_supported_devices = _get_capturable_supported_devices()
            assert (
                param.device.type == step_t.device.type
                and param.device.type in capturable_supported_devices
            ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."

        # update step
        step_t += 1

        if weight_decay != 0:
            grad = grad.add(param, alpha=weight_decay)

        if torch.is_complex(param):
            param = torch.view_as_real(param)
            grad = torch.view_as_real(grad)
            exp_avg = torch.view_as_real(exp_avg)
            exp_inf = torch.view_as_real(exp_inf)

        # Update biased first moment estimate.
        exp_avg.lerp_(grad, 1 - beta1)
        # Update the exponentially weighted infinity norm.
        if not differentiable:
            torch.maximum(
                exp_inf.mul_(beta2),
                grad.abs().add_(eps),
                out=exp_inf,
            )
        else:
            norm_buf = torch.cat(
                [exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0)],
                0,
            )
            exp_inf.copy_(torch.amax(norm_buf, 0, keepdim=False))

        if capturable:
            # Negate the bias correction and fold lr into it so a single
            # addcdiv_ applies the update without leaving the graph.
            neg_bias_correction = beta1**step_t - 1
            neg_bias_correction.div_(lr)
            denom = exp_inf * neg_bias_correction
            param.addcdiv_(exp_avg, denom)
        else:
            bias_correction = 1 - beta1 ** _get_value(step_t)
            clr = lr / bias_correction

            param.addcdiv_(exp_avg, exp_inf, value=-clr)


def _multi_tensor_adamax(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_infs: list[Tensor],
    state_steps: list[Tensor],
    *,
    eps: float,
    beta1: float,
    beta2: float,
    lr: float,
    weight_decay: float,
    maximize: bool,
    differentiable: bool,
    capturable: bool,
    has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"

    if len(params) == 0:
        return

    if not torch.jit.is_scripting():
        lr = _to_scalar(lr)

    # If compiling, the compiler handles cudagraph checks
    if not torch.compiler.is_compiling() and capturable:
        capturable_supported_devices = _get_capturable_supported_devices(
            supports_xla=False
        )
        assert all(
            p.device.type == step.device.type
            and p.device.type in capturable_supported_devices
            for p, step in zip(params, state_steps)
        ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_infs, state_steps]
    )
    for (
        grouped_params_,
        grouped_grads_,
        grouped_exp_avgs_,
        grouped_exp_infs_,
        grouped_state_steps_,
    ), _ in grouped_tensors.values():
        grouped_params = cast(list[Tensor], grouped_params_)
        grouped_grads = cast(list[Tensor], grouped_grads_)
        grouped_exp_avgs = cast(list[Tensor], grouped_exp_avgs_)
        grouped_exp_infs = cast(list[Tensor], grouped_exp_infs_)
        grouped_state_steps = cast(list[Tensor], grouped_state_steps_)

        if has_complex:
            _view_as_real(
                grouped_params, grouped_grads, grouped_exp_avgs, grouped_exp_infs
            )

        if maximize:
            grouped_grads = torch._foreach_neg(grouped_grads)

        # Update steps
        # If steps are on CPU, foreach will fall back to the slow path, which is
        # a for-loop calling t.add(1) over and over, re-wrapping 1 into a Tensor
        # each time. Wrapping it once here is faster; alpha is required to route
        # to the fused overload.
        if not torch.compiler.is_compiling() and grouped_state_steps[0].is_cpu:
            torch._foreach_add_(
                grouped_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(grouped_state_steps, 1)

        if weight_decay != 0:
            if maximize:
                # Re-use the intermediate memory (grouped_grads) already
                # allocated for maximize
                torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay)
            else:
                grouped_grads = torch._foreach_add(
                    grouped_grads, grouped_params, alpha=weight_decay
                )

        # Update biased first moment estimate.
        torch._foreach_lerp_(grouped_exp_avgs, grouped_grads, 1 - beta1)

        # Update the exponentially weighted infinity norm.
        torch._foreach_mul_(grouped_exp_infs, beta2)

        # In this case we need to introduce a copy of the grads
        # since one has not been introduced previously.
        if not maximize and weight_decay == 0:
            grouped_grads = torch._foreach_abs(grouped_grads)
        else:
            torch._foreach_abs_(grouped_grads)

        torch._foreach_add_(grouped_grads, eps)
        torch._foreach_maximum_(grouped_exp_infs, grouped_grads)

        if capturable:
            bias_corrections = torch._foreach_pow(beta1, grouped_state_steps)
            # foreach_sub doesn't allow a scalar as the first arg
            torch._foreach_sub_(bias_corrections, 1)
            torch._foreach_div_(bias_corrections, lr)

            denom = torch._foreach_mul(grouped_exp_infs, bias_corrections)
            torch._foreach_addcdiv_(grouped_params, grouped_exp_avgs, denom)
        else:
            bias_corrections = [
                1 - beta1 ** _get_value(step) for step in grouped_state_steps
            ]
            step_size = [(_get_value(lr) / bc) * -1 for bc in bias_corrections]
            torch._foreach_addcdiv_(
                grouped_params, grouped_exp_avgs, grouped_exp_infs, step_size
            )


@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adamax)
def adamax(
    params: list[Tensor],
    grads: list[Tensor],
    exp_avgs: list[Tensor],
    exp_infs: list[Tensor],
    state_steps: list[Tensor],
    # kwonly args with defaults are not supported by functions compiled with
    # torchscript; keeping these as positional-with-defaults for now since the
    # functional API is compiled by torch/distributed/optim
    foreach: Optional[bool] = None,
    maximize: bool = False,
    differentiable: bool = False,
    capturable: bool = False,
    has_complex: bool = False,
    *,
    eps: float,
    beta1: float,
    beta2: float,
    lr: float,
    weight_decay: float,
):
    r"""Functional API that performs adamax algorithm computation.

    See :class:`~torch.optim.Adamax` for details.
    """

    if not torch.compiler.is_compiling() and not all(
        isinstance(t, torch.Tensor) for t in state_steps
    ):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    if foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adamax
    else:
        func = _single_tensor_adamax

    func(
        params,
        grads,
        exp_avgs,
        exp_infs,
        state_steps,
        eps=eps,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        weight_decay=weight_decay,
        maximize=maximize,
        differentiable=differentiable,
        capturable=capturable,
        has_complex=has_complex,
    )
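

# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the upstream module): the docstring
# math restated on plain tensors, assuming the simple case -- no maximize, no
# complex params, no capturable/differentiable handling. The function name and
# signature below are hypothetical, added only to make the update rule concrete.
# ---------------------------------------------------------------------------
def _adamax_update_sketch(
    param: Tensor,
    grad: Tensor,
    m: Tensor,
    u: Tensor,
    t: int,
    lr: float = 2e-3,
    beta1: float = 0.9,
    beta2: float = 0.999,
    eps: float = 1e-8,
    weight_decay: float = 0.0,
) -> tuple[Tensor, Tensor, Tensor]:
    """One Adamax step written out directly, mirroring the math block above."""
    if weight_decay != 0:
        grad = grad + weight_decay * param  # g_t <- g_t + lambda * theta_{t-1}
    m = beta1 * m + (1 - beta1) * grad  # m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
    u = torch.maximum(beta2 * u, grad.abs() + eps)  # u_t <- max(beta2 * u_{t-1}, |g_t| + eps)
    bias_correction = 1 - beta1**t  # corrects the zero-initialized first moment
    param = param - (lr / bias_correction) * m / u  # theta_t <- theta_{t-1} - gamma * m_t / ((1 - beta1^t) * u_t)
    return param, m, u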
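

# Editor's usage sketch: a toy linear-regression loop driving the Adamax class
# defined above. The model, batch shapes, and iteration count are assumed
# placeholders, not part of the upstream module.
if __name__ == "__main__":
    model = torch.nn.Linear(10, 1)
    optimizer = Adamax(model.parameters(), lr=2e-3, betas=(0.9, 0.999))

    inputs = torch.randn(32, 10)  # placeholder batch
    targets = torch.randn(32, 1)

    for _ in range(5):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()  # dispatches to the adamax() functional API above
    print(f"final loss: {loss.item():.4f}")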
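

# Editor's sketch of the stateless functional API: the caller owns the state
# tensors (first moment, infinity norm, singleton step counter) and passes them
# in explicitly. The values here are arbitrary placeholders.
if __name__ == "__main__":
    p = torch.zeros(4)
    g = torch.ones(4)
    exp_avg = torch.zeros(4)  # m_0
    exp_inf = torch.zeros(4)  # u_0
    step = torch.tensor(0.0)  # singleton step tensor, as required by adamax()
    adamax(
        [p], [g], [exp_avg], [exp_inf], [step],
        eps=1e-8, beta1=0.9, beta2=0.999, lr=2e-3, weight_decay=0.0,
    )
    print(p)  # p updated in place: -lr * m_1 / ((1 - beta1^1) * u_1) elementwise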