L iJK K( $,   *c0t|||jD]}|jdd|jdd|jdd|jdd|dD]}|jj |g}t |dk7s.tj|drGt|d}|dr*tj|t|j ntj|t |d<y) Nr"rFrrrrstepdtypedevicer2) r( __setstate__ param_groups setdefaultstategetlentorch is_tensorfloattensorrr3)r*r8grouppp_statestep_valr,s r-r5zAdadelta.__setstate__As U#&& E   Y -   Z /   -u 5   \5 18_ **..B/w<1$U__WV_-M$WV_5H !. $,=,?#\\(:K:MN FO   r.r?params_with_gradgrads square_avgs acc_deltas state_stepscd}|dD]o}|j|tj|z}|j||jjr t d|j|j|j |} t| dk(r|dr*tjdt|jntjdt| d <tj|tj | d <tj|tj | d <|j| d |j| d |j| d r|S) NFrz*Adadelta does not support sparse gradientsrrr1r4r0) memory_format square_avg acc_delta) gradr; is_complexappend is_sparse RuntimeErrorr8r:zerosrr3 zeros_likepreserve_format) r*r?rCrDrErFrG has_complexr@r8s r- _init_groupzAdadelta._init_groupTsS x .Avv~ 5++A. .K  # #A &vv"#OPP LL JJqME5zQ\*KK*;*=ahhOR/@/BCf ',&6&6U%:%:'l#&+%5%5U%:%:&k"   u\2 3   eK0 1   uV} -9 .<r.cx|jd}|$tj5|}ddd|jD]f}g}g}g}g}g}|d|d|d|d|d|d|d|d f\} } } } } }}}|j ||||||}t |||||| | | | | |||| h|S#1swYxYw) zPerform a single optimization step. Args: closure (Callable, optional): A closure that reevaluates the model and returns the loss. Nrrr r!r"rrr) rrr r!r"rrrrU) _cuda_graph_capture_health_checkr; enable_gradr6rVr)r*closurelossr?rCrDrErFrGrrr r!r"rrrrUs r-r0z Adadelta.steps3 --/  ""$ !y !&&- E-/ "$E(*K')J(*Kd e e n%i j!&'l#  **' ZK  )!-%' =- ^ e ! !s B00B9)r$g?gư>rNN)__name__ __module__ __qualname__rrr=rrboolr)r5dictstrrlistrVrr0 __classcell__)r,s@r-rrs$'"&"+!$"+"+ %- "+ "+  "+  "+$"+"+"+"+H&)CH~)v,)F| ) &\ ) L )&\)V"="=r.aImplements Adadelta algorithm. .. math:: \begin{aligned} &\rule{110mm}{0.4pt} \\ &\textbf{input} : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)}, \: \lambda \text{ (weight decay)} \\ &\textbf{initialize} : v_0 \leftarrow 0 \: \text{ (square avg)}, \: u_0 \leftarrow 0 \: \text{ (accumulate variables)} \\[-1.ex] &\rule{110mm}{0.4pt} \\ &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\ &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\ &\hspace{5mm}if \: \lambda \neq 0 \\ &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\ &\hspace{5mm} v_t \leftarrow v_{t-1} \rho + g^2_t (1 - \rho) \\ &\hspace{5mm}\Delta x_t \leftarrow \frac{\sqrt{u_{t-1} + \epsilon }}{ \sqrt{v_t + \epsilon} }g_t \hspace{21mm} \\ &\hspace{5mm} u_t \leftarrow u_{t-1} \rho + \Delta x^2_t (1 - \rho) \\ &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \Delta x_t \\ &\rule{110mm}{0.4pt} \\[-1.ex] &\bf{return} \: \theta_t \\[-1.ex] &\rule{110mm}{0.4pt} \\[-1.ex] \end{aligned} For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_. z Args: ar lr (float, Tensor, optional): coefficient that scale delta before it is applied to the parameters (default: 1.0) rho (float, optional): coefficient used for computing a running average of squared gradients (default: 0.9). A higher value of `rho` will result in a slower average, which can be helpful for preventing oscillations in the learning process. eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-6). weight_decay (float, optional): weight decay (L2 penalty) (default: 0) z zd .. _ADADELTA\: An Adaptive Learning Rate Method: https://arxiv.org/abs/1212.5701 rrDrErFrGrrr r!rrrrUctjjs7| r5tdt fdt ||Ds Jddtj js t|}t |||||D]{\} }}}}|dz }| s|n| }|dk7r|j| |}tj| r?tj|}tj|}tj|}|j|j||d|z |j|j}|j|j}| r|j}|j!|j||j|j||d|z tj| rtj"|}| j%|| ~y) NF supports_xlac3K|]N\}}|jj|jjk(xr|jjvPywr\r3type.0r@r0capturable_supported_devicess r- z*_single_tensor_adadelta.. Q 4 HHMMT[[-- - > !== > AAIIf capturable=True, params and state_steps must be on supported devices: .rralphavalue)r;compiler is_compilingrallzipjit is_scriptingraddrN view_as_realmul_addcmul_sqrt_clonediv_view_as_complexadd_)rrDrErFrGrrr r!rrrrUparamrMrKrLr0stddeltarms @r-_single_tensor_adadeltars" >> & & (Z'H( $ v{3   XXtWuuv w    99 ! ! # ^47{J 5%0tZD  #t$ 1 88E86D   E "++J7J**95I%%d+D%%dDC%@nnS!'') c"((* KKME 3T"s$$UES$A   E "))%0E 5 $1%r.c| rJdtjjs7| r5tdt fdt ||Ds Jddt |dk(ryt|}tj|||||g} | jD]\\}}}}}}ttt|}ttt|}ttt|}ttt|}ttt|}| rt||||tjjs=|djr.tj |tj"dd d ntj |d | rtj$|}|dk7r3| rtj ||| ntj&||| }tj(||tj*|||d |z tj&||}tj,|tj&||}tj,|tj.||tj(||tj(||tj*|||d |z | rIt1|tjr/tj(|| tj ||tj ||| y)Nz#_foreach ops don't support autogradFrfc3K|]N\}}|jj|jjk(xr|jjvPywr\rirks r-rnz)_multi_tensor_adadelta..Grorprqrrrr$cpu)r3rsrru)r;rwrxrryrzr:rr"_group_tensors_by_device_and_dtypevaluesrrcrris_cpu _foreach_add_r> _foreach_neg _foreach_add _foreach_mul__foreach_addcmul__foreach_sqrt_ _foreach_div_r%)rrDrErFrGrrr r!rrrrUgrouped_tensorsdevice_params_ device_grads_device_square_avgs_device_acc_deltas_device_state_steps__ device_params device_gradsdevice_square_avgsdevice_acc_deltasdevice_state_stepsrdeltasrms @r-_multi_tensor_adadeltar0s DDD  >> & & (Z'H( $ v{3   XXtWuuv w    6{a BBBB  Z=O  " " $ >B  T&\>: DL-8 !$v,0CD f/AB!$v,0CD  |-?AR ~~**,1CA1F1M1M   "ELLU$C3     2A 6  --l;L 1 ##L-|T$11 -|   .4   l!c'   !3S9 S!##$5s; V$ FC( FL1 -s3  166SQ *R6    ,    v 6    vbS A}>Br.)single_tensor_fnr"c ztjjstd|Ds t d|t ||d\}}|r)tj jr t d|r%tj jst}nt}||||||| | | | | ||| y)zvFunctional API that performs Adadelta algorithm computation. See :class:`~torch.optim.Adadelta` for details. c3PK|]}t|tj ywr\)r%r;r)rlts r-rnzadadelta..s 5() 1ell#5s$&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF) use_fusedz6torch.jit.script not supported with foreach optimizers)rrr r!rrrrU) r;rwrxryrQr r{r|rr)rrDrErFrGrr"rrUrrr r!rrfuncs r-rrs6 >> & & (5-852 ^  1 Ne 7599))+STTuyy--/%&   !%r.)FNFF)typingrrrrr;r optimizerr r r r r rrrrrrrrr__all__r__doc__rcr=r`rrrrIr.r-rs-- $ z "ayaJ8       90 j8% L8% <8%f8%V 8% f 8% 8% 8% 8%8%8%8%8%8%veB LeB <eBfeBV eB f eB eB eB eBeBeBeBeBeBP 1HI" = L= <=f=V = f ==d^=== = = = !="#=J=r.