from typing import List, Tuple

import torch
from torch._vmap_internals import _vmap

from . import forward_ad as fwAD

__all__ = ["vjp", "jvp", "jacobian", "hessian", "hvp", "vhp"]

# Utility functions


def _as_tuple_nocheck(x):
    if isinstance(x, tuple):
        return x
    elif isinstance(x, list):
        return tuple(x)
    else:
        return (x,)


def _as_tuple(inp, arg_name=None, fn_name=None):
    # Ensures that inp is a tuple of Tensors.
    # Returns whether or not the original inp was a tuple and the tupled version of the input.
    if arg_name is None and fn_name is None:
        return _as_tuple_nocheck(inp)

    is_inp_tuple = True
    if not isinstance(inp, tuple):
        inp = (inp,)
        is_inp_tuple = False

    for i, el in enumerate(inp):
        if not isinstance(el, torch.Tensor):
            if is_inp_tuple:
                raise TypeError(
                    f"The {arg_name} given to {fn_name} must be either a Tensor or a tuple of Tensors but the"
                    f" value at index {i} has type {type(el)}."
                )
            else:
                raise TypeError(
                    f"The {arg_name} given to {fn_name} must be either a Tensor or a tuple of Tensors but the"
                    f" given {arg_name} has type {type(el)}."
                )

    return is_inp_tuple, inp


def _tuple_postprocess(res, to_unpack):
    # Unpacks a potentially nested tuple of Tensors so the result matches how the user
    # originally passed the corresponding arguments (single Tensor vs. tuple).
    if isinstance(to_unpack, tuple):
        assert len(to_unpack) == 2
        if not to_unpack[1]:
            res = tuple(el[0] for el in res)
        if not to_unpack[0]:
            res = res[0]
    else:
        if not to_unpack:
            res = res[0]
    return res


def _grad_preprocess(inputs, create_graph, need_graph):
    # Preprocess the inputs to make sure they require gradient.
    # inputs is a tuple of Tensors to preprocess.
    # create_graph specifies if the user wants gradients to flow back to the Tensors in inputs.
    # need_graph specifies if we internally want gradients to flow back to the Tensors in res.
    # Note that we *always* create a new Tensor object to be able to see the difference between
    # inputs given as arguments and the same Tensors automatically captured by the user function.
    res = []
    for inp in inputs:
        if create_graph and inp.requires_grad:
            # Create at least a new Tensor object in a differentiable way
            if not inp.is_sparse:
                # Use .view_as() to get a shallow copy
                res.append(inp.view_as(inp))
            else:
                # We cannot use view for sparse Tensors so we clone
                res.append(inp.clone())
        else:
            res.append(inp.detach().requires_grad_(need_graph))
    return tuple(res)


def _grad_postprocess(inputs, create_graph):
    # Postprocess the generated Tensors to avoid returning Tensors with history when the
    # user did not request it. Works recursively on (possibly nested) tuples of Tensors.
    if isinstance(inputs[0], torch.Tensor):
        if not create_graph:
            return tuple(inp.detach() for inp in inputs)
        else:
            return inputs
    else:
        return tuple(_grad_postprocess(inp, create_graph) for inp in inputs)


def _validate_v(v, other, is_other_tuple):
    # This assumes that other is the correct shape, and v should match.
    # Both are assumed to be tuples of Tensors.
    if len(other) != len(v):
        if is_other_tuple:
            raise RuntimeError(
                f"v is a tuple of invalid length: should be {len(other)} but got {len(v)}."
            )
        else:
            raise RuntimeError("The given v should contain a single Tensor.")

    for idx, (el_v, el_other) in enumerate(zip(v, other)):
        if el_v.size() != el_other.size():
            prepend = ""
            if is_other_tuple:
                prepend = f"Entry {idx} in "
            raise RuntimeError(
                f"{prepend}v has invalid size: should be {el_other.size()} but got {el_v.size()}."
            )


def _check_requires_grad(inputs, input_type, strict):
    # Raise nice errors in strict mode when some intermediate result does not require
    # gradients, i.e. when it is independent of the quantity we differentiate with
    # respect to.
    if not strict:
        return

    if input_type not in ("outputs", "grad_inputs", "jacobian", "hessian"):
        raise RuntimeError("Invalid input_type to _check_requires_grad")
    for i, inp in enumerate(inputs):
        if inp is None:
            raise RuntimeError(
                f"The output of the user-provided function is independent of input {i}."
                " This is not allowed in strict mode."
            )
        if not inp.requires_grad:
            raise RuntimeError(
                f"The {input_type} of the user-provided function is independent of input {i}."
                " This is not allowed in strict mode."
            )


def _autograd_grad(
    outputs,
    inputs,
    grad_outputs=None,
    create_graph=False,
    retain_graph=None,
    is_grads_batched=False,
):
    # Version of autograd.grad that accepts `None` in outputs and does not compute
    # gradients for them. This has the extra constraint that inputs has to be a tuple.
    assert isinstance(outputs, tuple)
    if grad_outputs is None:
        grad_outputs = (None,) * len(outputs)
    assert isinstance(grad_outputs, tuple)
    assert len(outputs) == len(grad_outputs)

    new_outputs: Tuple[torch.Tensor, ...] = tuple()
    new_grad_outputs: Tuple[torch.Tensor, ...] = tuple()
    for out, grad_out in zip(outputs, grad_outputs):
        if out is not None and out.requires_grad:
            new_outputs += (out,)
            new_grad_outputs += (grad_out,)

    if len(new_outputs) == 0:
        # No differentiable output, we don't need to call the autograd engine
        return (None,) * len(inputs)
    else:
        return torch.autograd.grad(
            new_outputs,
            inputs,
            new_grad_outputs,
            allow_unused=True,
            create_graph=create_graph,
            retain_graph=retain_graph,
            is_grads_batched=is_grads_batched,
        )


def _fill_in_zeros(grads, refs, strict, create_graph, stage):
    # Detect `None` entries in grads and either replace them with a Tensor full of 0s of
    # the appropriate size (based on refs) or, in strict mode, raise an error. `stage`
    # names the backward call that produced grads so errors can point at the right place.
    if stage not in ("back", "back_trick", "double_back", "double_back_trick"):
        raise RuntimeError(f"Invalid stage argument '{stage}' to _fill_in_zeros")

    res: Tuple[torch.Tensor, ...] = tuple()
    for i, grads_i in enumerate(grads):
        if grads_i is None:
            if strict:
                raise RuntimeError(
                    f"The result computed during stage '{stage}' is independent of input {i}."
                    " This is not allowed in strict mode."
                )
            grads_i = torch.zeros_like(refs[i])
        elif strict and create_graph and not grads_i.requires_grad:
            raise RuntimeError(
                f"The result computed during stage '{stage}' is independent of input {i}."
                " This is not allowed in strict mode when create_graph=True."
            )
        res += (grads_i,)

    return res


def vjp(func, inputs, v=None, create_graph=False, strict=False):
    r"""Compute the dot product between a vector ``v`` and the Jacobian of the given function at the point given by the inputs.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a tuple of Tensors or a Tensor.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        v (tuple of Tensors or Tensor): The vector for which the vector
            Jacobian product is computed. Must be the same size as the output
            of ``func``. This argument is optional when the output of ``func``
            contains a single element and (if it is not provided) will be set
            as a Tensor containing a single ``1``.
        create_graph (bool, optional): If ``True``, both the output and result
            will be computed in a differentiable way. Note that when ``strict``
            is ``False``, the result can not require gradients or be
            disconnected from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            vjp for said inputs, which is the expected mathematical value.
            Defaults to ``False``.

    Returns:
        output (tuple): tuple with:
            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``

            vjp (tuple of Tensors or Tensor): result of the dot product with
            the same shape as the inputs.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(4, 4)
        >>> v = torch.ones(4)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> vjp(exp_reducer, inputs, v)
        (tensor([5.7817, 7.2458, 5.7830, 6.7782]),
         tensor([[1.4458, 1.3962, 1.3042, 1.6354],
                 [2.1288, 1.0652, 1.5483, 2.5035],
                 [2.2046, 1.1292, 1.1432, 1.3059],
                 [1.3225, 1.6652, 1.7753, 2.0152]]))

        >>> vjp(exp_reducer, inputs, v, create_graph=True)
        (tensor([5.7817, 7.2458, 5.7830, 6.7782], grad_fn=<...>),
         tensor([[1.4458, 1.3962, 1.3042, 1.6354],
                 [2.1288, 1.0652, 1.5483, 2.5035],
                 [2.2046, 1.1292, 1.1432, 1.3059],
                 [1.3225, 1.6652, 1.7753, 2.0152]], grad_fn=<...>))

        >>> def adder(x, y):
        ...     return 2 * x + 3 * y
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> v = torch.ones(2)
        >>> vjp(adder, inputs, v)
        (tensor([2.4225, 2.3340]),
         (tensor([2., 2.]), tensor([3., 3.])))
    """
    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "vjp")
        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "vjp"
        )
        _check_requires_grad(outputs, "outputs", strict=strict)

        if v is not None:
            _, v = _as_tuple(v, "v", "vjp")
            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
            _validate_v(v, outputs, is_outputs_tuple)
        else:
            if len(outputs) != 1 or outputs[0].nelement() != 1:
                raise RuntimeError(
                    "The vector v can only be None if the user-provided function "
                    "returns a single Tensor with a single element."
                )

    enable_grad = True if create_graph else torch.is_grad_enabled()
    with torch.set_grad_enabled(enable_grad):
        grad_res = _autograd_grad(outputs, inputs, v, create_graph=create_graph)
        vjp = _fill_in_zeros(grad_res, inputs, strict, create_graph, "back")

    # Cleanup objects and return them to the user
    outputs = _grad_postprocess(outputs, create_graph)
    vjp = _grad_postprocess(vjp, create_graph)

    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
        vjp, is_inputs_tuple
    )

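
# Usage sketch (illustrative only; `_demo_vjp_matches_autograd_grad` is a name made up
# for this example and is not part of the public API): `vjp` should agree with a direct
# call to torch.autograd.grad when the same cotangent vector is used.
def _demo_vjp_matches_autograd_grad():
    def exp_reducer(x):
        return x.exp().sum(dim=1)

    inputs = torch.rand(4, 4)
    v = torch.ones(4)

    out, grads = vjp(exp_reducer, inputs, v)

    # Reference computation with plain autograd on a leaf copy of the inputs.
    leaf = inputs.detach().requires_grad_(True)
    (ref,) = torch.autograd.grad(exp_reducer(leaf), leaf, v)
    assert torch.allclose(grads, ref)
    return out, grads
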
def jvp(func, inputs, v=None, create_graph=False, strict=False):
    r"""Compute the dot product between the Jacobian of the given function at the point given by the inputs and a vector ``v``.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a tuple of Tensors or a Tensor.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        v (tuple of Tensors or Tensor): The vector for which the Jacobian
            vector product is computed. Must be the same size as the input of
            ``func``. This argument is optional when the input to ``func``
            contains a single element and (if it is not provided) will be set
            as a Tensor containing a single ``1``.
        create_graph (bool, optional): If ``True``, both the output and result
            will be computed in a differentiable way. Note that when ``strict``
            is ``False``, the result can not require gradients or be
            disconnected from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            jvp for said inputs, which is the expected mathematical value.
            Defaults to ``False``.

    Returns:
        output (tuple): tuple with:
            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``

            jvp (tuple of Tensors or Tensor): result of the dot product with
            the same shape as the output.

    Note:
        This function computes the jvp using the backward of the backward
        (the "double backwards trick"), which is not the most performant way
        of doing it. Please consider using :func:`torch.func.jvp` or the
        forward-mode AD API in :mod:`torch.autograd.forward_ad` instead.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(4, 4)
        >>> v = torch.ones(4, 4)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> jvp(exp_reducer, inputs, v)
        (tensor([6.3090, 4.6742, 7.9114, 8.2106]),
         tensor([6.3090, 4.6742, 7.9114, 8.2106]))

        >>> jvp(exp_reducer, inputs, v, create_graph=True)
        (tensor([6.3090, 4.6742, 7.9114, 8.2106], grad_fn=<...>),
         tensor([6.3090, 4.6742, 7.9114, 8.2106], grad_fn=<...>))

        >>> def adder(x, y):
        ...     return 2 * x + 3 * y
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> v = (torch.ones(2), torch.ones(2))
        >>> jvp(adder, inputs, v)
        (tensor([2.2399, 2.5005]),
         tensor([5., 5.]))
    """
    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jvp")
        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)

        if v is not None:
            _, v = _as_tuple(v, "v", "jvp")
            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
            _validate_v(v, inputs, is_inputs_tuple)
        else:
            if len(inputs) != 1 or inputs[0].nelement() != 1:
                raise RuntimeError(
                    "The vector v can only be None if the input to "
                    "the user-provided function is a single Tensor "
                    "with a single element."
                )

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "jvp"
        )
        _check_requires_grad(outputs, "outputs", strict=strict)

        # The backward is linear so the value of grad_outputs is not important as
        # it won't appear in the double backward graph. We only need to ensure that
        # it does not contain inf or nan.
        grad_outputs = tuple(
            torch.zeros_like(out, requires_grad=True) for out in outputs
        )

        grad_inputs = _autograd_grad(outputs, inputs, grad_outputs, create_graph=True)
        _check_requires_grad(grad_inputs, "grad_inputs", strict=strict)

    enable_grad = True if create_graph else torch.is_grad_enabled()
    with torch.set_grad_enabled(enable_grad):
        grad_res = _autograd_grad(
            grad_inputs, grad_outputs, v, create_graph=create_graph
        )
        jvp = _fill_in_zeros(grad_res, outputs, strict, create_graph, "back_trick")

    # Cleanup objects and return them to the user
    outputs = _grad_postprocess(outputs, create_graph)
    jvp = _grad_postprocess(jvp, create_graph)

    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
        jvp, is_inputs_tuple
    )


def _construct_standard_basis_for(
    tensors: Tuple[torch.Tensor, ...], tensor_numels: Tuple[int, ...]
) -> Tuple[torch.Tensor, ...]:
    # Constructs a standard basis of the space with sum(tensor_numels) elements, i.e. an
    # NxN identity matrix, split column-block-wise with one chunk per tensor.
    # chunks[i] has shape (sum(tensor_numels), tensor_numels[i]) and the same dtype and
    # device as tensors[i]; concatenating the chunks along dim=1 gives the identity.
    assert len(tensors) == len(tensor_numels)
    assert len(tensors) > 0
    total_numel = sum(tensor_numels)
    chunks = tuple(
        tensor.new_zeros(total_numel, tensor_numel)
        for tensor, tensor_numel in zip(tensors, tensor_numels)
    )
    diag_start_idx = 0
    for chunk, numel in zip(chunks, tensor_numels):
        chunk.diagonal(diag_start_idx).fill_(1)
        diag_start_idx -= numel
    return chunks


def _jacfwd(func, inputs, strict=False, vectorize=False):
    if strict:
        raise RuntimeError(
            "torch.autograd.functional.jacobian: `strict=True` "
            'and `strategy="forward-mode"` are not supported together (yet). '
            "Please either set `strict=False` or "
            '`strategy="reverse-mode"`.'
        )
    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
    output_info = []

    if vectorize:
        # Compute one forward-mode jvp per column of the identity matrix over the
        # flattened inputs, batched with vmap over dual tensors.
        input_numels = tuple(inp.numel() for inp in inputs)

        # Step 1: Prepare tangents
        tangents = _construct_standard_basis_for(inputs, input_numels)

        # Step 2: Compute vmap over computation with dual tensors
        def jvp(tangents):
            with fwAD.dual_level():
                dual_inputs = tuple(
                    fwAD.make_dual(inp, tangent.view_as(inp))
                    for inp, tangent in zip(inputs, tangents)
                )
                _is_outputs_tuple, dual_outputs = _as_tuple(
                    func(*dual_inputs), "outputs"
                )
                output_info.append(_is_outputs_tuple)
                jv = []
                primal_outs = []
                for dual_out in dual_outputs:
                    primal, tangent = fwAD.unpack_dual(dual_out)
                    primal_outs.append(primal)
                    if tangent is not None:
                        jv.append(tangent)
                    else:
                        jv.append(torch.zeros_like(primal))
                output_info.append(primal_outs)
                return tuple(jv)

        outputs_before_split = _vmap(jvp)(tangents)
        is_outputs_tuple, outputs = output_info

        # Step 3: for each of the output tangents, split along dim 0 and move the
        # batch (input) dimension to the end so each block has shape
        # output.shape + input.shape.
        jacobian_input_output = []
        for jac_output_i, output_i in zip(outputs_before_split, outputs):
            jacobian_output_i_output = []
            for jac, input_j in zip(jac_output_i.split(input_numels, dim=0), inputs):
                jacobian_input_i_output_j = jac.permute(*range(1, jac.ndim), 0).reshape(
                    (*output_i.shape, *input_j.shape)
                )
                jacobian_output_i_output.append(jacobian_input_i_output_j)
            jacobian_input_output.append(jacobian_output_i_output)

        # Everything is already in (output, input) order with forward AD.
        return _tuple_postprocess(
            jacobian_input_output, (is_outputs_tuple, is_inputs_tuple)
        )
    else:
        raise NotImplementedError(
            "Computing the Jacobian with forward-mode AD is only implemented for "
            "`vectorize=True`."
        )

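
# Illustrative check (the helper name `_demo_standard_basis_chunks` is made up for this
# sketch): the chunks produced by _construct_standard_basis_for are column blocks of an
# identity matrix whose size is the total number of elements across the given tensors.
def _demo_standard_basis_chunks():
    a, b = torch.zeros(2), torch.zeros(3)
    chunks = _construct_standard_basis_for((a, b), (2, 3))
    # Reassembling the blocks along dim=1 recovers the 5x5 identity.
    assert torch.equal(torch.cat(chunks, dim=1), torch.eye(5))
    return chunks
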
def jacobian(
    func,
    inputs,
    create_graph=False,
    strict=False,
    vectorize=False,
    strategy="reverse-mode",
):
    r"""Compute the Jacobian of a given function.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a tuple of Tensors or a Tensor.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        create_graph (bool, optional): If ``True``, the Jacobian will be
            computed in a differentiable manner. Note that when ``strict`` is
            ``False``, the result can not require gradients or be disconnected
            from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            jacobian for said inputs, which is the expected mathematical value.
            Defaults to ``False``.
        vectorize (bool, optional): This feature is experimental.
            Please consider using :func:`torch.func.jacrev` or
            :func:`torch.func.jacfwd` instead if you are looking for something
            less experimental and more performant.
            When computing the jacobian, usually we invoke ``autograd.grad``
            once per row of the jacobian. If this flag is ``True``, we perform
            only a single ``autograd.grad`` call with ``batched_grad=True``
            which uses the vmap prototype feature. Though this should lead to
            performance improvements in many cases, because this feature is
            still experimental, there may be performance cliffs. See
            :func:`torch.autograd.grad`'s ``batched_grad`` parameter for more
            information.
        strategy (str, optional): Set to ``"forward-mode"`` or
            ``"reverse-mode"`` to determine whether the Jacobian will be
            computed with forward or reverse mode AD. Currently,
            ``"forward-mode"`` requires ``vectorized=True``. Defaults to
            ``"reverse-mode"``. If ``func`` has more outputs than inputs,
            ``"forward-mode"`` tends to be more performant. Otherwise, prefer
            to use ``"reverse-mode"``.

    Returns:
        Jacobian (Tensor or nested tuple of Tensors): if there is a single
        input and output, this will be a single Tensor containing the Jacobian
        for the linearized inputs and output. If one of the two is a tuple,
        then the Jacobian will be a tuple of Tensors. If both of them are
        tuples, then the Jacobian will be a tuple of tuple of Tensors where
        ``Jacobian[i][j]`` will contain the Jacobian of the ``i``\th output and
        ``j``\th input and will have as size the concatenation of the sizes of
        the corresponding output and the corresponding input and will have same
        dtype and device as the corresponding input. If strategy is
        ``forward-mode``, the dtype will be that of the output; otherwise, the
        input.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(2, 2)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> jacobian(exp_reducer, inputs)
        tensor([[[1.4917, 2.4352],
                 [0.0000, 0.0000]],
                [[0.0000, 0.0000],
                 [2.4369, 2.3799]]])

        >>> jacobian(exp_reducer, inputs, create_graph=True)
        tensor([[[1.4917, 2.4352],
                 [0.0000, 0.0000]],
                [[0.0000, 0.0000],
                 [2.4369, 2.3799]]], grad_fn=<...>)

        >>> def exp_adder(x, y):
        ...     return 2 * x.exp() + 3 * y
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> jacobian(exp_adder, inputs)
        (tensor([[2.8052, 0.0000],
                 [0.0000, 3.3963]]),
         tensor([[3., 0.],
                 [0., 3.]]))
    """
    assert strategy in ("forward-mode", "reverse-mode"), (
        'Expected strategy to be either "forward-mode" or "reverse-mode". Hint: If your '
        'function has more outputs than inputs, "forward-mode" tends to be more performant. '
        'Otherwise, prefer to use "reverse-mode".'
    )
    if strategy == "forward-mode":
        if create_graph:
            raise NotImplementedError(
                "torch.autograd.functional.jacobian: `create_graph=True` and "
                '`strategy="forward-mode"` are not supported together (yet). '
                "Please either set `create_graph=False` or "
                '`strategy="reverse-mode"`.'
            )
        return _jacfwd(func, inputs, strict, vectorize)

    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "jacobian"
        )
        _check_requires_grad(outputs, "outputs", strict=strict)

        if vectorize:
            if strict:
                raise RuntimeError(
                    "torch.autograd.functional.jacobian: `strict=True` "
                    "and `vectorized=True` are not supported together. "
                    "Please either set `strict=False` or "
                    "`vectorize=False`."
                )
            # Step 1: the rows of the Jacobian are vjps against the rows of an identity
            # matrix, so build the standard basis of the flattened output space.
            output_numels = tuple(output.numel() for output in outputs)
            grad_outputs = _construct_standard_basis_for(outputs, output_numels)
            flat_outputs = tuple(output.reshape(-1) for output in outputs)

            # Step 2: a single batched autograd.grad call computes all the vjps at once.
            def vjp(grad_output):
                vj = list(
                    _autograd_grad(
                        flat_outputs,
                        inputs,
                        grad_output,
                        create_graph=create_graph,
                        is_grads_batched=True,
                    )
                )
                for el_idx, vj_el in enumerate(vj):
                    if vj_el is not None:
                        continue
                    vj[el_idx] = torch.zeros_like(inputs[el_idx]).expand(
                        (sum(output_numels),) + inputs[el_idx].shape
                    )
                return tuple(vj)

            jacobians_of_flat_output = vjp(grad_outputs)

            # Step 3: there is one big Tensor per input; split each of them by output.
            jacobian_input_output = []
            for jac_input_i, input_i in zip(jacobians_of_flat_output, inputs):
                jacobian_input_i_output = []
                for jac, output_j in zip(
                    jac_input_i.split(output_numels, dim=0), outputs
                ):
                    jacobian_input_i_output_j = jac.view(output_j.shape + input_i.shape)
                    jacobian_input_i_output.append(jacobian_input_i_output_j)
                jacobian_input_output.append(jacobian_input_i_output)

            # Step 4: swap the (input, output) nesting into the documented
            # (output, input) order.
            jacobian_output_input = tuple(zip(*jacobian_input_output))

            jacobian_output_input = _grad_postprocess(
                jacobian_output_input, create_graph
            )
            return _tuple_postprocess(
                jacobian_output_input, (is_outputs_tuple, is_inputs_tuple)
            )

        jacobian: Tuple[torch.Tensor, ...] = tuple()

        for i, out in enumerate(outputs):
            jac_i: Tuple[List[torch.Tensor], ...] = tuple(
                [] for _ in range(len(inputs))
            )
            for j in range(out.nelement()):
                vj = _autograd_grad(
                    (out.reshape(-1)[j],),
                    inputs,
                    retain_graph=True,
                    create_graph=create_graph,
                )

                for el_idx, (jac_i_el, vj_el, inp_el) in enumerate(
                    zip(jac_i, vj, inputs)
                ):
                    if vj_el is not None:
                        if strict and create_graph and not vj_el.requires_grad:
                            msg = (
                                "The jacobian of the user-provided function is "
                                f"independent of input {el_idx}. This is not allowed in "
                                "strict mode when create_graph=True."
                            )
                            raise RuntimeError(msg)
                        jac_i_el.append(vj_el)
                    else:
                        if strict:
                            msg = (
                                f"Output {i} of the user-provided function is "
                                f"independent of input {el_idx}. This is not allowed in "
                                "strict mode."
                            )
                            raise RuntimeError(msg)
                        jac_i_el.append(torch.zeros_like(inp_el))

            jacobian += (
                tuple(
                    torch.stack(jac_i_el, dim=0).view(
                        out.size() + inputs[el_idx].size()
                    )
                    for (el_idx, jac_i_el) in enumerate(jac_i)
                ),
            )

        jacobian = _grad_postprocess(jacobian, create_graph)

        return _tuple_postprocess(jacobian, (is_outputs_tuple, is_inputs_tuple))

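
# Usage sketch (illustrative; `_demo_jacobian_of_linear_map` is a hypothetical helper
# name, not part of the module): for a linear map f(x) = W @ x the Jacobian is exactly
# W, and the vectorized path should agree with the default per-row computation.
def _demo_jacobian_of_linear_map():
    W = torch.tensor([[2.0, -1.0], [0.0, 1.0], [3.0, 5.0]])

    def f(x):
        return W @ x

    x = torch.randn(2)
    jac_loop = jacobian(f, x)
    jac_vec = jacobian(f, x, vectorize=True)
    assert torch.allclose(jac_loop, W)
    assert torch.allclose(jac_loop, jac_vec)
    return jac_loop
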
def hessian(
    func,
    inputs,
    create_graph=False,
    strict=False,
    vectorize=False,
    outer_jacobian_strategy="reverse-mode",
):
    r"""Compute the Hessian of a given scalar function.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor with a single element.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        create_graph (bool, optional): If ``True``, the Hessian will be computed
            in a differentiable manner. Note that when ``strict`` is ``False``,
            the result can not require gradients or be disconnected from the
            inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            hessian for said inputs, which is the expected mathematical value.
            Defaults to ``False``.
        vectorize (bool, optional): This feature is experimental.
            Please consider using :func:`torch.func.hessian` instead if you are
            looking for something less experimental and more performant.
            When computing the hessian, usually we invoke ``autograd.grad`` once
            per row of the hessian. If this flag is ``True``, we use the vmap
            prototype feature as the backend to vectorize calls to
            ``autograd.grad`` so we only invoke it once instead of once per row.
            This should lead to performance improvements in many use cases,
            however, due to this feature being incomplete, there may be
            performance cliffs. Please use
            `torch._C._debug_only_display_vmap_fallback_warnings(True)` to show
            any performance warnings and file us issues if warnings exist for
            your use case. Defaults to ``False``.
        outer_jacobian_strategy (str, optional): The Hessian is computed by
            computing the Jacobian of a Jacobian. The inner Jacobian is always
            computed in reverse-mode AD. Setting strategy to ``"forward-mode"``
            or ``"reverse-mode"`` determines whether the outer Jacobian will be
            computed with forward or reverse mode AD. Currently, computing the
            outer Jacobian in ``"forward-mode"`` requires ``vectorized=True``.
            Defaults to ``"reverse-mode"``.

    Returns:
        Hessian (Tensor or a tuple of tuple of Tensors): if there is a single
        input, this will be a single Tensor containing the Hessian for the
        input. If it is a tuple, then the Hessian will be a tuple of tuples
        where ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input
        and ``j``\th input with size the sum of the size of the ``i``\th input
        plus the size of the ``j``\th input. ``Hessian[i][j]`` will have the
        same dtype and device as the corresponding ``i``\th input.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> hessian(pow_reducer, inputs)
        tensor([[[[5.2265, 0.0000],
                  [0.0000, 0.0000]],
                 [[0.0000, 4.8221],
                  [0.0000, 0.0000]]],
                [[[0.0000, 0.0000],
                  [1.9456, 0.0000]],
                 [[0.0000, 0.0000],
                  [0.0000, 3.2550]]]])

        >>> hessian(pow_reducer, inputs, create_graph=True)
        tensor([[[[5.2265, 0.0000],
                  [0.0000, 0.0000]],
                 [[0.0000, 4.8221],
                  [0.0000, 0.0000]]],
                [[[0.0000, 0.0000],
                  [1.9456, 0.0000]],
                 [[0.0000, 0.0000],
                  [0.0000, 3.2550]]]], grad_fn=<...>)

        >>> def pow_adder_reducer(x, y):
        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> hessian(pow_adder_reducer, inputs)
        ((tensor([[4., 0.],
                  [0., 4.]]),
          tensor([[0., 0.],
                  [0., 0.]])),
         (tensor([[0., 0.],
                  [0., 0.]]),
          tensor([[6., 0.],
                  [0., 6.]])))
    """
    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "hessian")
    assert outer_jacobian_strategy in (
        "forward-mode",
        "reverse-mode",
    ), 'Expected strategy to be either "forward-mode" or "reverse-mode".'

    def ensure_single_output_function(*inp):
        out = func(*inp)
        is_out_tuple, t_out = _as_tuple(
            out, "outputs of the user-provided function", "hessian"
        )
        _check_requires_grad(t_out, "outputs", strict=strict)

        if is_out_tuple or not isinstance(out, torch.Tensor):
            raise RuntimeError(
                "The function given to hessian should return a single Tensor"
            )

        if out.nelement() != 1:
            raise RuntimeError(
                "The Tensor returned by the function given to hessian should contain a single element"
            )

        return out.squeeze()

    def jac_func(*inp):
        if outer_jacobian_strategy == "forward-mode":
            # _grad_preprocess would detach the inputs otherwise, so make sure they
            # require grad before taking the inner (reverse-mode) Jacobian.
            inp = tuple(t.requires_grad_(True) for t in inp)
        jac = jacobian(ensure_single_output_function, inp, create_graph=True)
        _check_requires_grad(jac, "jacobian", strict=strict)
        return jac

    res = jacobian(
        jac_func,
        inputs,
        create_graph=create_graph,
        strict=strict,
        vectorize=vectorize,
        strategy=outer_jacobian_strategy,
    )
    return _tuple_postprocess(res, (is_inputs_tuple, is_inputs_tuple))

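
# Usage sketch (illustrative; `_demo_hessian_of_quadratic` is a made-up helper name):
# for the quadratic form f(x) = 0.5 * x^T A x with symmetric A, the Hessian is A itself.
def _demo_hessian_of_quadratic():
    A = torch.tensor([[2.0, 1.0], [1.0, 3.0]])  # symmetric by construction

    def f(x):
        return 0.5 * x @ A @ x

    x = torch.randn(2)
    H = hessian(f, x)
    assert torch.allclose(H, A)
    return H
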
def vhp(func, inputs, v=None, create_graph=False, strict=False):
    r"""Compute the dot product between a vector ``v`` and the Hessian of a given scalar function at the point given by the inputs.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor with a single element.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        v (tuple of Tensors or Tensor): The vector for which the vector Hessian
            product is computed. Must be the same size as the input of ``func``.
            This argument is optional when ``func``'s input contains a single
            element and (if it is not provided) will be set as a Tensor
            containing a single ``1``.
        create_graph (bool, optional): If ``True``, both the output and result
            will be computed in a differentiable way. Note that when ``strict``
            is ``False``, the result can not require gradients or be
            disconnected from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            vhp for said inputs, which is the expected mathematical value.
            Defaults to ``False``.

    Returns:
        output (tuple): tuple with:
            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``

            vhp (tuple of Tensors or Tensor): result of the dot product with the
            same shape as the inputs.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
        >>> v = torch.ones(2, 2)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> vhp(pow_reducer, inputs, v)
        (tensor(0.5591),
         tensor([[1.0689, 1.2431],
                 [3.0989, 4.4456]]))

        >>> vhp(pow_reducer, inputs, v, create_graph=True)
        (tensor(0.5591, grad_fn=<...>),
         tensor([[1.0689, 1.2431],
                 [3.0989, 4.4456]], grad_fn=<...>))

        >>> def pow_adder_reducer(x, y):
        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> v = (torch.zeros(2), torch.ones(2))
        >>> vhp(pow_adder_reducer, inputs, v)
        (tensor(4.8053),
         (tensor([0., 0.]), tensor([6., 6.])))
    """
    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "vhp")
        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)

        if v is not None:
            _, v = _as_tuple(v, "v", "vhp")
            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
            _validate_v(v, inputs, is_inputs_tuple)
        else:
            if len(inputs) != 1 or inputs[0].nelement() != 1:
                raise RuntimeError(
                    "The vector v can only be None if the input to the "
                    "user-provided function is a single Tensor with a single element."
                )

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "vhp"
        )
        _check_requires_grad(outputs, "outputs", strict=strict)

        if is_outputs_tuple or not isinstance(outputs[0], torch.Tensor):
            raise RuntimeError(
                "The function given to vhp should return a single Tensor"
            )

        if outputs[0].nelement() != 1:
            raise RuntimeError(
                "The Tensor returned by the function given to vhp should contain a single element"
            )

        jac = _autograd_grad(outputs, inputs, create_graph=True)
        _check_requires_grad(jac, "jacobian", strict=strict)

    enable_grad = True if create_graph else torch.is_grad_enabled()
    with torch.set_grad_enabled(enable_grad):
        grad_res = _autograd_grad(jac, inputs, v, create_graph=create_graph)
        vhp = _fill_in_zeros(grad_res, inputs, strict, create_graph, "double_back")

    # Cleanup objects and return them to the user
    outputs = _grad_postprocess(outputs, create_graph)
    vhp = _grad_postprocess(vhp, create_graph)

    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
        vhp, is_inputs_tuple
    )


def hvp(func, inputs, v=None, create_graph=False, strict=False):
    r"""Compute the dot product between the Hessian of a given scalar function and a vector ``v`` at the point given by the inputs.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor with a single element.
        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
        v (tuple of Tensors or Tensor): The vector for which the Hessian vector
            product is computed. Must be the same size as the input of ``func``.
            This argument is optional when ``func``'s input contains a single
            element and (if it is not provided) will be set as a Tensor
            containing a single ``1``.
        create_graph (bool, optional): If ``True``, both the output and result
            will be computed in a differentiable way. Note that when ``strict``
            is ``False``, the result can not require gradients or be
            disconnected from the inputs. Defaults to ``False``.
        strict (bool, optional): If ``True``, an error will be raised when we
            detect that there exists an input such that all the outputs are
            independent of it. If ``False``, we return a Tensor of zeros as the
            hvp for said inputs, which is the expected mathematical value.
            Defaults to ``False``.

    Returns:
        output (tuple): tuple with:
            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``

            hvp (tuple of Tensors or Tensor): result of the dot product with the
            same shape as the inputs.

    Example:

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
        >>> v = torch.ones(2, 2)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> hvp(pow_reducer, inputs, v)
        (tensor(0.1448),
         tensor([[2.0239, 1.6456],
                 [2.4988, 1.4310]]))

        >>> hvp(pow_reducer, inputs, v, create_graph=True)
        (tensor(0.1448, grad_fn=<...>),
         tensor([[2.0239, 1.6456],
                 [2.4988, 1.4310]], grad_fn=<...>))

        >>> def pow_adder_reducer(x, y):
        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
        >>> inputs = (torch.rand(2), torch.rand(2))
        >>> v = (torch.zeros(2), torch.ones(2))
        >>> hvp(pow_adder_reducer, inputs, v)
        (tensor(2.3030),
         (tensor([0., 0.]), tensor([6., 6.])))

    Note:
        This function is significantly slower than `vhp` due to backward mode
        AD constraints. If your function is twice continuously differentiable,
        then hvp = vhp.t(). So if you know that your function satisfies this
        condition, you should use vhp instead, which is much faster with the
        current implementation.
    """
    with torch.enable_grad():
        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "hvp")
        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)

        if v is not None:
            _, v = _as_tuple(v, "v", "hvp")
            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
            _validate_v(v, inputs, is_inputs_tuple)
        else:
            if len(inputs) != 1 or inputs[0].nelement() != 1:
                raise RuntimeError(
                    "The vector v can only be None if the input to the "
                    "user-provided function is a single Tensor with a single element."
                )

        outputs = func(*inputs)
        is_outputs_tuple, outputs = _as_tuple(
            outputs, "outputs of the user-provided function", "hvp"
        )
        _check_requires_grad(outputs, "outputs", strict=strict)

        if is_outputs_tuple or not isinstance(outputs[0], torch.Tensor):
            raise RuntimeError(
                "The function given to hvp should return a single Tensor"
            )

        if outputs[0].nelement() != 1:
            raise RuntimeError(
                "The Tensor returned by the function given to hvp should contain a single element"
            )

        jac = _autograd_grad(outputs, inputs, create_graph=True)
        _check_requires_grad(jac, "jacobian", strict=strict)

        # Double backward trick: differentiate the jacobian against a dummy,
        # differentiable grad_jac, then differentiate that result against grad_jac
        # with grad_outputs=v to obtain H @ v.
        grad_jac = tuple(torch.zeros_like(inp, requires_grad=True) for inp in inputs)

        double_back = _autograd_grad(jac, inputs, grad_jac, create_graph=True)
        _check_requires_grad(double_back, "hessian", strict=strict)

    enable_grad = True if create_graph else torch.is_grad_enabled()
    with torch.set_grad_enabled(enable_grad):
        grad_res = _autograd_grad(double_back, grad_jac, v, create_graph=create_graph)
        hvp = _fill_in_zeros(
            grad_res, inputs, strict, create_graph, "double_back_trick"
        )

    # Cleanup objects and return them to the user
    outputs = _grad_postprocess(outputs, create_graph)
    hvp = _grad_postprocess(hvp, create_graph)

    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
        hvp, is_inputs_tuple
    )

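
# Illustrative check (the helper name `_demo_vhp_matches_hvp` is made up for this
# sketch): for a twice continuously differentiable scalar function the Hessian is
# symmetric, so the vhp and hvp results coincide (see the Note in `hvp`); `vhp` is the
# cheaper of the two with the current implementation.
def _demo_vhp_matches_hvp():
    def pow_reducer(x):
        return x.pow(3).sum()

    inputs = torch.rand(2, 2)
    v = torch.rand(2, 2)
    _, v_h = vhp(pow_reducer, inputs, v)
    _, h_v = hvp(pow_reducer, inputs, v)
    assert torch.allclose(v_h, h_v)
    return v_h
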