L i4zddlZddlZddlZddlZddlmZddlmZmZm Z m Z m Z ddl Z ddl mZddlmZddl mZddlmZmZmZmZmZddlmZmZmZmZdd lmZdd l m!Z!dd l"m#Z#dd l$m%Z%dd l&m'Z'ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9ddl:m;Z;mZ>m?Z?ddl@mAZAddlBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ddl\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcerddldmeZeddlfmgZgddlhmiZie/jekZle+rddlmmnZnmoZogdZpe?jde?jde?jde?jde?jde?jde?jde?jde?jd i ZzeGd!d"e)Z{eGd#d$e)Z|eGd%d&e)Z}eGd'd(e)Z~e{Ze{Ze{Ze|Ze|Ze|Ze}Ze}Ze~Ze~Ze eefZe eefZe eefZe eefZe eefZe e{e|fZe e}e~fZe eefZGd)d*eAZd+Zd-d,Zy).N) dataclass) TYPE_CHECKINGAnyCallableOptionalUnion)version)nn)Cache DynamicCacheEncoderDecoderCacheQuantizedCache StaticCache)check_python_requirementsget_cached_module_fileget_class_in_moduleresolve_trust_remote_code)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_masks_for_generate)isin_mps_friendly)ExtensionsTrie) ModelOutputTransformersKwargsis_accelerate_availableis_hqq_availableis_optimum_quanto_availableis_torchdynamo_exportinglogging) AssistantVocabTranslatorCacheAssistedCandidateGenerator-AssistedCandidateGeneratorDifferentTokenizersCandidateGeneratorEarlyExitCandidateGeneratorPromptLookupCandidateGenerator%UniversalSpeculativeDecodingGenerator_prepare_attention_mask_prepare_token_type_ids) ALL_STATIC_CACHE_IMPLEMENTATIONS'DEPRECATED_STATIC_CACHE_IMPLEMENTATIONSSTATIC_CACHE_IMPLEMENTATIONSGenerationConfigGenerationMode)ContinuousMixin)#EncoderNoRepeatNGramLogitsProcessor'EncoderRepetitionPenaltyLogitsProcessorEpsilonLogitsWarperEtaLogitsWarperExponentialDecayLengthPenaltyForcedBOSTokenLogitsProcessorForcedEOSTokenLogitsProcessorInfNanRemoveLogitsProcessorLogitNormalizationLogitsProcessorListMinLengthLogitsProcessor!MinNewTokensLengthLogitsProcessorMinPLogitsWarperNoBadWordsLogitsProcessorNoRepeatNGramLogitsProcessor PrefixConstrainedLogitsProcessor RepetitionPenaltyLogitsProcessorSequenceBiasLogitsProcessor$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperTypicalLogitsWarper.UnbatchedClassifierFreeGuidanceLogitsProcessor)ConfidenceCriteriaEosTokenCriteriaMaxLengthCriteriaMaxTimeCriteriaStoppingCriteriaStoppingCriteriaListStopStringCriteria)PreTrainedModel)PreTrainedTokenizerBase) BaseStreamer)AlignDevicesHookadd_hook_to_module)past_key_values cache_paramsstatememspast_buckets_states_sample _beam_search_assisted_decodingztransformers-community/dolaz)transformers-community/contrastive-searchz(transformers-community/group-beam-searchz.transformers-community/constrained-beam-searchceZdZUdZej ed<dZee ejed<dZ ee ejed<dZ ee e ejed<dZ ee e ejed<dZeeed<y) GenerateDecoderOnlyOutputa\ Outputs of decoder-only generation models, when using non-beam methods. Args: sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`): Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`. past_key_values (`Cache`, *optional*, returned when `use_cache=True`): Returns the model cache, used to speed up decoding. Different models have a different cache format, check the model's documentation. Usually, a [`~cache_utils.Cache`] instance. sequencesNscoreslogits attentions hidden_statesrV)__name__ __module__ __qualname____doc__torch LongTensor__annotations__rartuple FloatTensorrbrcrdrVr c/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/generation/utils.pyr_r_s415FHU5,,- .515FHU5,,- .5<@JuU%6%6789@?CM8E%(9(9":;<C'+OXe_+ror_ceZdZUdZej ed<dZee ejed<dZ ee ejed<dZ ee ejed<dZ ee ejed<dZee e ejed<dZee e ejed <dZee e ejed <dZeeed <y) GenerateEncoderDecoderOutputa Outputs of encoder-decoder generation models, when using non-beam methods. Args: sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`): Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`. past_key_values (`Cache`, *optional*, returned when `use_cache=True`): Returns the model cache, used to speed up decoding. Different models have a different cache format, check the model's documentation. Usually, a [`~cache_utils.Cache`] instance. r`Nrarbencoder_attentionsencoder_hidden_statesdecoder_attentionscross_attentionsdecoder_hidden_statesrV)rerfrgrhrirjrkrarrlrmrbrsrtrurvrwrVr rnrorprrrrs!F15FHU5,,- .515FHU5,,- .5=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDK'+OXe_+rorrcXeZdZUdZej ed<dZeejed<dZ ee ejed<dZ ee ejed<dZ eej ed<dZee e ejed<dZee e ejed <dZeeed <y) GenerateBeamDecoderOnlyOutputa Outputs of decoder-only generation models, when using beam methods. Args: sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early due to the `eos_token_id`. sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`): Final beam scores of the generated `sequences`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`): Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. past_key_values (`Cache`, *optional*, returned when `use_cache=True`): Returns the model cache, used to speed up decoding. Different models have a different cache format, check the model's documentation. Usually, a [`~cache_utils.Cache`] instance. r`Nsequences_scoresrarb beam_indicesrcrdrV)rerfrgrhrirjrkrzrrmrarlrbr{rcrdrVr rnrorpryrys@48hu001815FHU5,,- .515FHU5,,- .5/3L(5++,3<@JuU%6%6789@?CM8E%(9(9":;<C'+OXe_+roryceZdZUdZej ed<dZeejed<dZ ee ejed<dZ ee ejed<dZ eej ed<dZee ejed<dZee ejed <dZee e ejed <dZee e ejed <dZee e ejed <dZeeed <y) GenerateBeamEncoderDecoderOutputa Outputs of encoder-decoder generation models, when using beam methods. Args: sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early due to the `eos_token_id`. sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`): Final beam scores of the generated `sequences`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`): Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, sequence_length)`. cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. past_key_values (`Cache`, *optional*, returned when `use_cache=True`): Returns the model cache, used to speed up decoding. Different models have a different cache format, check the model's documentation. Usually, a [`~cache_utils.Cache`] instance. r`Nrzrarbr{rsrtrurvrwrV)rerfrgrhrirjrkrzrrmrarlrbr{rsrtrurvrwrVr rnrorpr}r}s(T48hu001815FHU5,,- .515FHU5,,- .5/3L(5++,3=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDK'+OXe_+ror}c$eZdZdZ dzdeeeejfdee de fdZ de jdee jd ee jdee je jffd Zde jdee jd ee jdee je jffd Z d{de jd eed ee jdee jd ee jf dZ d|dee j(dee j(deeee j(fdee j(eeeee j(fffdZ d|dee j(dee j(deeee j(fde jfdZde j(dedeeefde jfdZde j(deededeeeffdZ d}dededeee j(fde j(dee j:dee jeee j(fff dZe d~dede dee jdee jeeefffdZ dd e!deeefde d!edeeeff d"Z" d|dede jde j(d#e#deeefd$ed%d&ed'd(ed'de$fd)Z% dded*eed+ee jd,ee ee j(ge&efd#ee#deedeeeefd-ee j(d.ee j(de#fd/Z' d}ded0ee(d1ed'de(fd2Z)d3ee#e(fd4ee#e(fdee#e(ffd5Z* dd6e j(d7ee j(d8ee j(d9e de j(f d:Z+d;Z,deeeffd<Z-d=Z.d>Z/ d}deed?ee d@edeeeffdAZ0dBZ1dCededDedefdEZ2e3de fdFZ4dededGe5dedHede f dIZ6de fdJZ7 dzdedKee deee j:effdLZ8deeefdede fdMZ9 d}dGe5de dNeedeefdOZ:deeeffdPZ;e jx ddee j(deed#ee#d0ee(d,ee ee j(ge&efdQee d$ed%dRedSd-ee j(d.ee j(d?ee dNeeee fdee=e jffdTZ>dUe dQe de j:de fdVZ? d}de jd1ed'de jfdWZ@ dde jd#e#d0e(dedQe dRedSdeeAe jffdXZBedYe j(de j(fdZZCedYe j(ded[ede j(fd\ZDedYe j(d8e j(de j(fd]ZEed^e j(d_e j(d`e j(dae j(dbedceddedeee efdfeFfdgZGed^e j(dae j(dhe j(deee effdiZHdje j(dke j(dle j(dbeddedme dned[edoededee j(e j(e j(ffdpZIdqe j(dre j(dse j(dhe j(d[edee j(e j(e j(ff dtZJd6e j(dre j(d`e j(dqe j(d8e j(dse j(d^e j(dae j(dhe j(due j(d[edbeddedfeFdeee efdee j(e j(e j(e j(ff dvZK dde jd#e#d0e(dedQe deeLe jff dwZM dde jd#e#d0e(dedQe dRedSdee jd$ed%d(ed'd1ed'deeAe jffdxZNde jdefdyZOy)GenerationMixina A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes. Inheriting from this class causes the model to have special generation-related behavior, such as loading a `GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI. A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which approximately shares the same interface to public methods like `generate`. Three examples: - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public methods in the mixin; - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase; - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`. However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case, `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface. The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* if `num_beams=1` and `do_sample=False` - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). Npretrained_model_name_or_pathtrust_remote_codereturnc t|fddi|}tjj |}d|d}t |||| |t |fdd i|td |}|S#t$rtd|dwxYw) at Loads and returns a custom generate function, given a model repo. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - A path to a *directory* containing model weights saved using [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. trust_remote_code (`bool`, *optional*): Whether or not to allow for custom models defined on the Hub in their own modeling files. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. **kwargs: Additional keyword arguments for remote code loading. Raises: OSError: If `pretrained_model_name_or_path` does not contain a `custom_generate` subdirectory. Returns: A callable that can be used to generate text. module_filezcustom_generate/generate.py`zw` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.zThe repository `zS` contains custom generation code that will override the default `generate` method.)has_local_codehas_remote_code error_messagerequirements_filez custom_generate/requirements.txtgenerate)rOSErrorospathexistsrrr)selfrrkwargsmodule is_local_codercustom_generate_functions rpload_custom_generatez$GenerationMixin.load_custom_generates< +-;X\bF'DE <=>- -  "  )( --'   " ) =_ ci $7z6#J ''3 123OO  s A""A; input_ids inputs_embedscache_positionc^tr|j|||S|/|jddk(r|dd|jd df}||fS||d|jdk\r|dd|jd df}||fS|jd|jdk7r |dd|f}||fS)a Generic cache-dependent input preparation The code is put in a separate function to allow granular unit testing as it needs a different implementation to be exportable. If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - Exception 1: when passing input_embeds, input_ids may be missing entries - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. - Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and generate the first token for each sequence. Later use the generated Input ids for continuation. The current implementation does not rely on ``self`` and could be a class method. It is left as a standard method to be easily rewritten. Nr!r)r,_cache_dependant_input_preparation_exportingshape)rrrrs rp"_cache_dependant_input_preparationz2GenerationMixin._cache_dependant_input_preparations* $ %DDYP]_mn n  $);q)@)!n.B.B1.E-E-G*GHMi''  %r"iooa&88!!n&:&:1&=%=%?"?@Ii''__Q >#7#7#: :!!^"34Ii''roc| |dd|f}||fSdddtj|jddk(fdfd|||g\}}||fS) z This method implements method ``_cache_dependant_input_preparation`` with :func:`torch.cond` to make it exportable with :func:`torch.export.export`. The code is put in a separate function to allow granular unit testing. NcP|dd|jd dfjSNrrclone)rrs rpbranch_1zNGenerationMixin._cache_dependant_input_preparation_exporting..branch_1s,$Q)=)=a)@(@(B%BCIIKKrocP|dd|jd dfjSrrrrs rpbranch_2zNGenerationMixin._cache_dependant_input_preparation_exporting..branch_2s, ^%9%9!%<$<$>!>?EEGGroc0|dd|fjSNrrs rpbranch_3zNGenerationMixin._cache_dependant_input_preparation_exporting..branch_3s N!2399;;ror!rc6|||jfSrr)rrrrs rpzNGenerationMixin._cache_dependant_input_preparation_exporting..s ?!)Froch|tj|d|jdk\fd||gfS)Nrr!cttj|jd|jdk7d||gS)Nr!rc"|jSrrrs rprzrGenerationMixin._cache_dependant_input_preparation_exporting......s yGXroricondr)rrrs rprz`GenerationMixin._cache_dependant_input_preparation_exporting....s> % $-OOA$6.:N:Nq:Q$Q$,%X%.$? !"ror)rrrrrs rprzNGenerationMixin._cache_dependant_input_preparation_exporting.. sB% *2.)//!2DD$'7 Fror)rrrrrrrs @@@rprz:5( $M98i''rorVattention_maskc i}||d<|||d<|j|||\}}|jjrdnd}|jjsQ|&t||jdk(r d||<||d<nL|j t j||<d|d<n#|j t j||<|jjr|nd} |jjr|jd dn|}|jjrd nd } |jjrd nd } ||j| ~| ttj|jjjvr<|j!j#d dz } | j%|dk(d| || <dD]~} |j| }||a|jd|djdn||jd}|dd| df}|j t j}||| <t'|t(rY|j*rL|I|j,dk(r9|jjs|d|dj\}}}n||jdd\}}t/||j0|}t3|dr|j5nd}t/|dd}|| t/|dd}|n|jd}|j| } t/|dt6}||jt j8||f|j:|||| |}n2||||j=|j:|||j|}|||| <| | |d <|j?D]\}}||vs |||<|jdd|S)a_ Prepare the model inputs for generation. Notable steps include selecting the correct input key and cloning when appropriate, creating position_ids from the attention_mask when missing, slicing inputs and converting 2D attention masks to 4D for compilable caches, and finally forwarding all additional keyword arguments unchanged to the model's forward pass. See the forward pass in the model documentation for expected arguments (different models might have different requirements for e.g. `past_key_values`). This function should work as is for most LLMs. rNrVdecoder_input_idsrr!r) memory_formatdecoder_attention_maskrdecoder_position_ids position_idsrr)rtoken_type_idsrr get_decoder5_prepare_4d_causal_attention_mask_with_cache_positionrrdtype)config input_embedsrrrVrr)sequence_length target_lengthrr batch_sizerrVlabels) rris_encoder_decoderlenrrricontiguous_formatpopgetsetinspect signatureforward parameterskeyslongcumsum masked_fill_ isinstancer is_compileablendimgetattrbase_model_prefixhasattrrremptyrget_max_cache_shapeitems)rrrVrrrr model_inputs input_ids_keyencoder_attention_maskattention_mask_keyposition_ids_keyrmodel_input_name model_inputcurrent_input_lengthrr_ base_modeldecodercausal_mask_creation_functionrkeyvalues rpprepare_inputs_for_generationz-GenerationMixin.prepare_inputs_for_generation"s[& )7 %&  &.=L* +(,'N'N=.( $M9 04{{/M/M+S^ {{--(S-@MDWDWXYDZ-Z.2 ]+0= _-/8ooELcLco.d ]+04 _-*3//H_H_/*`L '48;;3Q3QW[:>++:X:XFJJ/ 6^l :>9W9W5]m59[[5S5S1Yg  & +,4 C(9(9$,,(G(R(R(W(W(Y$ZZ)..077;a?L  % %n&91 ='3F# $![ =  **%56K&".(++O<H%_5;;A>)-8>>qA) #.a2F1F1G.G"HK"-"3"3%BYBY"3"ZK1< -. =  ...*##q(;;11l?6S6_1=o1N1T1T. OQ.:=.I.O.OPRQR.S+ O!t'='=tDJ29*m2Tj,,.Z^G,3SUY- )-49L07TVZ1- -4!-!1!12B!C+//0@A 07>Y[t0u-!>;;!&j/-JRVR\R\!]#1#1$3!-#1 ""?"$3"1"E"E"G**#1);;$3 "  %/=L+ , ! --CL) *!,,. *JC,&$) S! * 4(roinputs bos_token_id model_kwargsc h|jjrFt|dr:|jj|jk7r|jj}n |j}|j Dcic]\}}|||k7s||}}}|j |d}||td|d|d|d|d ||}|dk(rd |vr|d |j d n|jjsd ttj|jjjv}|s#td |jjd |j!||| |d<|d d }}n| td |d d }}|j!|||}|||fScc}}w)zT This function extracts the model-specific `inputs` for generation. encoderNz `inputs`: z` were passed alongside z0 which is not allowed. Make sure to either pass z or z=...rrzAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)rzMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)rrrrmain_input_namerr ValueErrorrrrrrr __class__re*_maybe_initialize_input_ids_for_generation) rrrr input_namekv inputs_kwarghas_inputs_embeds_forwardings rp_prepare_model_inputsz%GenerationMixin._prepare_model_inputss KK * *i( ,,0D0DD55J--J)5););)=bARSWaRa1b b$'' D9  #(:VH$#%%d&H&HITTYY[C0,4$[\`\j\j\s\s[tuxx-1,[,[L|-\- [)&2/%BO %$%tuu%1/%BO @@Wcdz<//[cs <F. F.cp||S|jd}|jjrR|P|jj dd}t j |t j|jdzSd}|jD]-}t|t js|jd}nd|vr2t j |dft j|jS| td t j |dft j|j|zS) z3Initializes input ids for generation, if necessary.Nencoder_outputsrrdeviceir!rrzB`bos_token_id` has to be defined when no `input_ids` are provided.)rrrlast_hidden_statesizerionesrrvaluesrTensorrr)rrrrrrrrs rprz:GenerationMixin._maybe_initialize_input_ids_for_generations   M&**+<= ;; ) )o.I#55::zQGenerationMixin._prepare_encoder_decoder_kwargs_for_generation..NsI!x**1-Irroutput_attentionsoutput_hidden_states return_dictrrn) get_encoderrrrrUrTrr rrrrrr#r$r) rrrrrrirrelevant_prefixr rencoder_kwargsencoder_signatureencoder_accepts_wildcards ` rp._prepare_encoder_decoder_kwargs_for_generationz>GenerationMixin._prepare_encoder_decoder_kwargs_for_generation8sy""$ 4 )w +26  /"7,7P7P &') s*!D9 D?,D?rdecoder_start_token_idrc|d|vr|jd}nd|vr|dk7r|jd}nd}| |j}|jdk(rC|jd|k7rt d|d|jd|j dd}n+t j|dft j| |z}||}||fSd |jjjvsI|jjd k(r5d |jjjjvr ||fS|jjd k(r ||fS|dddf|dddfk7jj!r\t j"||gd }d|vr?|d}t j"t j$|ddddf|fd }||d<||fS)zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNrrr!rz1`decoder_start_token_id` expected to have length z but got rrdonutzvision-encoder-decoderwhisperdimr)rrrrrviewrirrrrelowerr model_typerallitemcat ones_like)rrrrr,rrrs rp)_prepare_decoder_input_ids_for_generationz9GenerationMixin._prepare_decoder_input_ids_for_generationas(  #(;|(K , 0 01D E  L (-=-L , 0 0 =  $  >[[F ! & &! +%++A.*< G |S\]s]y]yz{]|\}~&<%@%@Q%G " J?%**VLOee #  $ 6 ,!,..%//557 7 KK " "&> >7dkkNaNaNlNlNrNrNtCt !,..[[ # #y 0 !,.. 1%)?1)EE J J L Q Q S % +ACT*U[] ^ '<7)56N)O&).__%;._expand_dict_for_generationsd% d++&s+7">##6 E*8*=*O*OP[ab*O*cN3'  d" !rorr0rzMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)r=rr)r:rrrr?s` rp_expand_inputs_for_generationz-GenerationMixin._expand_inputs_for_generations ! l* * "  !33KQ3GI2<@  12: !pqq.I,WhJi.jL* +,&&rooutputsnum_new_tokensctD] }||vs|dvrd}n|}t||||<nd|vr7|d}tj||dddfj dgd|d<|sDd|vr|d}tj||j |j ddfgd|d<nCd |vr?|d } tj| | j | j ddfgd|d <|jd d r|d dd|z|d <|S|jd } tj| ddz| d|zdz| j j| j} tj| | f|d <|S)N)rZrYrVrrr0rrr!rrTrr) ALL_CACHE_NAMESrrir7 unsqueezenew_onesrrrarangertor) rrArrrBpossible_cache_name cache_namerrrpast_positions new_positionss rp#_update_model_kwargs_for_generationz3GenerationMixin._update_model_kwargs_for_generations$3  "g-&*II!2J!4J+27!?16#^%<%R>RST>UWX=Y%Z[ac2 -. (<7)56N)O&9>+-C-L-LNdNjNjklNmopMq-rs: 56   K .-9:J-KBC-PSa-aL) * *--.>?N!LLr"Q&r(:^(Ka(OWeWkWkb&&' .3YY 7V-WL) *rologits_processorassistant_modelrQtarget_tokenizerrRassistant_tokenizerc td|||fD} |jt||||||} | S|jat |j |j|j xsd|j||jjj} | S| r|jdurctj|||jjj|d} d|j_t!|||||||||  } | S|jd urt#|||||||| } | St%d t'|jj(t+||||||} | S) zU Returns the candidate generator to be used in `assisted_generation` c3$K|]}|du ywrrn)rrs rpr!z;GenerationMixin._get_candidate_generator..s"sQ1D="ssN)rrOrrrrNr )r num_output_tokensmax_matching_ngram_size max_lengthrN vocab_sizeT)rOassistant_prune_lm_head) rrOrrrrNrPrQatm_translatorF)rrOrrrrNrPrQz7Invalid value for `do_sample`: expected a boolean, got )r5assistant_early_exitr&prompt_lookup_num_tokensr'rrUrVrget_text_configrW do_sampler"get_translatorrrepetition_penaltyr(r$rtyperer#) rrrrrNrrOrPrQdifferent_tokenizerscandidate_generatorrYs rp_get_candidate_generatorz(GenerationMixin._get_candidate_generators #"s?L\^q:r"ss  1 1 ="=# $"3)+!1 # ~#"o 7 7 C"@.@@"3"L"L(9(Q(Q(VUV,77!1;;668CC # l#"]" **d2!>!M!M$'KK//1<<$3,0 "HL11D&K'$3&7!-"/%5%5(;#1 '#F#"1#,,5&S'$3&7!-"/%5%5(; '#.#"!MdSdSnSnNoNxNxMyz#=# /"3)+!1 # #"roinput_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnnegative_prompt_idsnegative_prompt_attention_maskc t} |g}|jB|jdk7r3| jt|j||| |j|j %| jt |j |jh|jdk7rYt|jdk(r'| jt|j|ntjdt|j4|jdk7r%| jt|j |j 3|j d kDr$| jt#|j |j$g|j$d kDrXt|jdk(r&| jt'|j$|ntjd t|j(/| jt+|j(|j,|j.Mt1|d d@|j.d kDr1| jt3|j.|j,| |j4Nt1|d dA|j4d kDr2| jt7||j4|j,| |%| jt9||j:|j<$| jt?|j<|j@1| jtC|jD|j@| |jFdur| jtI|jJ0| jtM|jJ|j,||jN&| jtQ|jN| |jRA|} |dkDs |j<| n| dz} | jtU|jR| | |jW| |} |jXrQ|j:dkDrwt[|j,t\rt|j,dz} nFt[|j,t^j`r|j,jd dz} nd} nd} |jb3|jbdk7r$| jte|jb|jf5|jfd k7r&| jti|jf| |jj5|jjdkr&| jtm|jj| |jn&| jtq|jn| |jr5|jrdkr&| jtu|jr| |jv>d|jvcxkrdkr)nn&| jty|jv| |jz?d|jzcxkrdkr*nn'| jt}|jz| ||j~M| j|j~j|jjj||jdur| jt| S)z This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`] instances used to modify the scores of the language model head. Nr!)unconditional_idsunconditional_attention_maskr sequence_bias?r )penaltyrezyPassing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.)rorz{Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.rrT)top_kmin_tokens_to_keep)top_prr)min_prr)massrr)epsilonrr)rwrrr)Fr:guidance_scaleappendrIrrmrBencoder_repetition_penaltyrrr2warningswarn UserWarningr_rAno_repeat_ngram_sizer?encoder_no_repeat_ngram_sizer1 bad_words_idsr>r min_lengthrr;min_new_tokensr<r@ num_beamsforced_bos_token_idr6forced_eos_token_idr7rVremove_invalid_valuesr8 exponential_decay_length_penaltyr5suppress_tokensrDbegin_suppress_tokensrC_merge_criteria_processor_listr]rlistrir temperaturerErqrFrsrGrtr= typical_prHepsilon_cutoffr3 eta_cutoffr4watermarking_configconstruct_processorrr\rWrenormalize_logitsr9) rrrdrerfrNrrrgrh processors begin_indexrrs rp_get_logits_processorz%GenerationMixin._get_logits_processorBs")*  #!   + + 7%44&91O/99    * * 6   9HYHgHgh i  8 8 D!<<C$**+q0!!; 1 L L*; 9  / / ;@Q@d@dhk@k   >GXGkGkl m  1 1 =BSBhBhklBl   :;L;a;ab c  : : F!>>B$**+q0!!7)FF) 9  * * 6   )%33%77   ( ( 4)+>EQ!,,q0   (%00%77!   , , 8)+>EQ!0014   1(%44%77!   $ /   0,%//   0 0 <   -%99   0 0 <   -%00%99!   2 2d :   9; <  = = I   -%FF%77(   , , 8   -%55!   2 2 >.K)1,0A0U0U0] 1_    4%;;! 88EUV   & &!**Q./AA4H),->-P-P)QTU)U& 1 C CU\\R):)L)L)R)RST)UXY)Y&)*&%&"!,,8=N=Z=Z^a=a!!"9:K:W:W"XY &&27H7N7NRS7S!!$+<+B+BWij!&&27H7N7NQT7T!!$+<+B+BWij!&&2!!$+<+B+BWij!**6;L;V;VY\;\!!'->-H-H]op!//;FWFfFf@lil@l!!' 1 @ @Ug !++7CBSB^B^1` at generate-time. normalize_logits (`bool`, *optional*, defaults to `False`): Whether to normalize the logits (which, for legacy reasons, may be unnormalized). Return: `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing the transition scores (logits) Examples: ```python >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM >>> import numpy as np >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") >>> tokenizer.pad_token_id = tokenizer.eos_token_id >>> inputs = tokenizer(["Today is"], return_tensors="pt") >>> # Example 1: Print the scores for each token generated with Greedy Search >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True) >>> transition_scores = model.compute_transition_scores( ... outputs.sequences, outputs.scores, normalize_logits=True ... ) >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for >>> # encoder-decoder models, like BART or T5. >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1] >>> generated_tokens = outputs.sequences[:, input_length:] >>> for tok, score in zip(generated_tokens[0], transition_scores[0]): ... # | token | token string | log probability | probability ... print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}") | 262 | the | -1.414 | 24.33% | 1110 | day | -2.609 | 7.36% | 618 | when | -2.010 | 13.40% | 356 | we | -1.859 | 15.58% | 460 | can | -2.508 | 8.14% >>> # Example 2: Reconstruct the sequence scores from Beam Search >>> outputs = model.generate( ... **inputs, ... max_new_tokens=5, ... num_beams=4, ... num_return_sequences=4, ... return_dict_in_generate=True, ... output_scores=True, ... ) >>> transition_scores = model.compute_transition_scores( ... outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False ... ) >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores. >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the >>> # use case, you might want to recompute it with `normalize_logits=True`. >>> # Tip 2: the output length does NOT include the input length >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1) >>> length_penalty = model.generation_config.length_penalty >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty) >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores)) True ```Nrrr!r0)rirGrr2rHrexpandrstackreshape transposerr\rWr functional log_softmaxrsummaxrgather) rr`rar{rbeam_indices_maskmax_beam_lengthbeam_sequence_indicescut_idxindicestransition_scoress rpcompute_transition_scoresz)GenerationMixin.compute_transition_scoresjsh   <<q (:;@@QGJJ9K[K[\L'..r3v;?LV$,,S["=GG1M ^^B (C(C(E(P(PRXR^R^_aRbcFXX((44V4CF^^B R(89F)1,05577<.s4 \`GDKK.'/:P:PRV2WW s7:zThe main model and the assistant don't have compatible encoder-dependent input shapes. Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper.zc(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)rQz`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` rz~The main and assistant models have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` )r/ BEAM_SEARCHrASSISTED_GENERATIONnum_return_sequences _is_statefulrrerrrdirr5r\rW) rgeneration_modergeneration_mode_kwargsattributes_to_checkr are_equal doc_referencerOs ` @rp_validate_generation_modez)GenerationMixin._validate_generation_modes n88 8ZKa=al  n@@ @ 559 /DDEQH  !YZ^ZhZhZqZqYrs 699:KL LO Y{{--o6L6L6_6_&f#8;O|jj>|_|j@di|}|jB}|jD}|jA|rd|ini|jA|rd|ini||fS)z Prepares the base generation config, then applies any generation configuration options from kwargs. This function handles retrocompatibility with respect to configuration files. FNra?You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed in v5. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )Thybridz4.50.0rtransformers_versioncache_implementationrvzX`generation_config` default values have been modified to match model-specific defaults: z=. If this is not desired, please set these values explicitly.r#r$rn)#r_from_model_config_original_object_hashhashrr&_get_non_default_generation_parametersr.from_model_configr{r|r}rcopydeepcopyr parser base_version__dict__rrrsetattrrr]rrrr r r,updater#r$)rrrrusing_model_generation_confignew_generation_configmodel_base_versionmodified_values global_default_generation_configmodel_generation_configrmodel_gen_config_valueglobal_default_valuecustom_gen_config_valuerr#r$s rp_prepare_generation_configz*GenerationMixin._prepare_generation_configs,).%  $&&99**@@DI_I_D`` JJLMPQQ(8(J(J4;;(W%(D,B,BBMMB$ .CD* $ 6 6 ,0 ) !55A9=!6!MM*;<,")w}}T=S=S=h=h/i/v/v!w !T)"*/AW]]S[E\/\"$3C3E0*.*@*@'3J3S3S3Y3Y3[ P/C/~~c*c5K.K 449P9e9eiq9q +23SUXZ^+_(.56Gd.S+/3GG26JJ/E, 138NO P %00C727%/%-#o2F2J''r*++hj %119595K5K5X5X%2$119595K5K5X5X%2$119595K5K5X5X%2$;;C?C?U?U?l?l%<0(//9&9 -??0EEHY02CD_abNb35IJhjk ,..rocd|vr|d|Sd|vrY|jjsCtj|dddddftjj ddz }nd|vrY|jjrCtj|dddddftjj ddz }n8tj |tj|j ddz }d}|jd N|d }d}t|tr|ddjd }nt|d r|j}||d}||d<|S) zbCalculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past lengthrNrrrr!decoder_inputs_embedsrrVr get_seq_length) rrrir8int64rrrrrlrrr)r seq_lengthrrr past_lengthcaches rp_get_initial_cache_positionz+GenerationMixin._get_initial_cache_positionse | + =M0N0Z  l *4;;3Q3Q"__\/-J1aQR7-S[`[f[fgnnopqtuuN $ 49W9W -D EaAg NV[VaVabiijklopp #ZZ %++fU\\]^_bccN   - . : !23EK%'#Ahqk//2  01#224 +KL9N)7 %&ror max_cache_lencJ|jjxs|jddu}d|v}t|dr$|r|jj n |j}t|d xs1j |k7xs |j|k7xs|j|k}|rFt|dr:|xs6|jjj|ddjdk7}|r|jjd||d } td i| |_|rW|jjd|ddjd|d } t|jtd i| |_|jS|jj|jS) z Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a new `generate` call requires a larger cache or uses a different batch size. Returns the resulting cache object. rN offloaded_cacherr!Tr)rr offloadingrn)rrrrr self_attention_cacher max_batch_sizercross_attention_cacherr\rrreset) rrrrrrequires_cross_attention_cache offload_cachecache_to_checkneed_new_cacheself_attention_cache_kwargscross_attention_cache_kwargss rp _get_cachezGenerationMixin._get_cache0s KK * * ]l.>.>?P.QY].] '$';; 4 "A_T[[==eiepepNh' ' <((M9 <,, : <++m;  *gdH.Er;;44BBlSdFefgFhFnFnopFqq  ++55d5C!.++ ' &D(CDDK-"kk99$9G%12C%DQ%G%M%Ma%P"/0, 2$++{?jMi?jk {{ KK   {{rocJj xrtfddDS)a Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`. This adds exception for some models like `Mamba` models which use their own caches and do not need to initialize the Cache in advance in order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed for mamba-based models). c3VK|] }|jjv"ywr)rer3)rspecial_model_nameclss rpr!zBGenerationMixin._supports_default_dynamic_cache..gs+ , " cll&8&8&: : , s&))reformerminimaxxlnetlfm2zlfm2-vl)rr5)rs`rp_supports_default_dynamic_cachez/GenerationMixin._supports_default_dynamic_cache^s1###   , ' , ) rormax_cache_lengthcltfddD}|sdnd}jjxs|jddu}|j|} | |jt d|dt | trTjrDtjd |stj| ntj| ||<y|jd uryjs0|j#tjd |jd y|tj k(r6|j*tjd |jdd|_|tj tj"fvs|jdk(ri} ndjj%di} |j|jt&vr|jt(vr*tjd|jdt*dj-|jt/|j0|j2|z||||<n|jdk(rÉjjsjs t d|j4 |j4ni} d| vrjj%| d<| j7dd} | dk(rt9s t;d| dk(rt=s t;dt?d!d| i| ||<nM|jdk(rtd!i| ddi||<n+d |jvrtd!i| ||<ntd!i| ||<|r0t ||tst||td!i| ||<yyy)"z Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is instantiated, writes it to `model_kwargs`, under the name expected by the model. c3jK|]*}|jjjv,ywr)rrer3)r class_namers rpr!z@GenerationMixin._prepare_cache_for_generation..s*tPZjDNN,C,C,I,I,KKts03)mambafalconh1rVrWrNzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `Cache` instead.FzNThis model does not support `Cache` instances. `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'. dynamic_fullrTr zUsing `cache_implementation='z(' is deprecated. Please only use one of z9, and the layer structure will be inferred automatically.)rrrr quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.backendquantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via with `pip install optimum-quanto`HQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via with `pip install hqq`r r dynamicrn) r rrrrrrrlr rrr from_legacy_cacherrr/rCONTRASTIVE_SEARCHr\r+r,r-rrrr cache_configrr ImportErrorrr) rrrrrr!is_hybrid_cacherJruser_defined_cachedynamic_cache_kwargsr/r)s ` rp_prepare_cache_for_generationz-GenerationMixin._prepare_cache_for_generationrst^stt.=&>  KK * * ]l.>.>?P.QY].] '*--j9  ) 55A cdncopTT,e49]9]9_##F :!223EF,>>?QRZ(   & &% / 335 55A##d(==>>PR  ~AA A!66B   %::;2? 6:  2  B BNDeDef f 55G#% $,dkk.I.IRV.I.W#X  1 1 = 559YY$99=dd''78I8^8^7_`22N1OP22 ,0??):)O)O"#4#>#>@Q@f@fgjtt"2!- ,;, Z( #77;F;;119]9]9_$M BSA_A_Ak0==qs </-1[[-H-H-JL*&**9h?h&/J/L%\%.>.@%H,:+Z'+Z\+Z Z("77;F+7+`:N+`[_+` Z(/DDD+7+O:N+O Z((4'K6J'KL $ **\*=UWj2k':Z(434(L $3l )rocdttj|jjj vS)z Return True if the current model supports the keyword argument `logits_to_keep` in forward() to save memory. Checking it in this way allows to avoid using a new model attribute. logits_to_keep)rrrrrr)rs rp_supports_logits_to_keepz(GenerationMixin._supports_logits_to_keeps2  3w'8'8'F'Q'Q'V'V'X#YYYrokwargs_has_attention_maskcd fd }||j|}||j|}||j|}||j|}jj r||n|}| |j dk(r|jd}|9|7||stjd|d}tjd|djj r | td|4t|| jr||stjd |Atj|s|dkjrtjd |d ||_||_||_||_y)a Prepares the special tokens for generation, overwriting the generation config with their processed versions converted to tensor. Note that `generation_config` is changed in place and stops being serializable after this method is called. That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the function). However, if called outside `generate`, consider creating a copy of `generation_config` first. Nc||S||n j}t|tjr|j |Stj ||tj S)Nrr)rrrirrHtensorr)tokenrrs rp_tensor_or_nonez@GenerationMixin._prepare_special_tokens.._tensor_or_nonesQ} %1Vt{{F%.xx''<<fEJJG GrorprzThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.rzThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r)rr r r,rrrrErrrrr rriis_floating_point_bos_token_tensorrr_decoder_start_token_tensor) rrr8rr>bos_token_tensoreos_token_tensorpad_token_tensordecoder_start_token_tensors ` rp_prepare_special_tokensz'GenerationMixin._prepare_special_tokenss  H++<+I+IRXY*+<+I+IRXY*+<+I+IRXY%45F5]5]fl%m" ;; ) ).H.T*Zj '  ',<,A,AQ,F/99!<   #(8(D(4=Vq 02  NNFGWFXXqr s ;; ) ).H.Pn   (!+;K[\``b(4=V##C  '  # #$4 5:JQ:N9S9S9U NNMN^M_`rr /?+.>+.>+8R5rocX|jry|jjdk(xs/t|jduxr|jj }t |jdtxr|dj}|xr|}t|dd||jjz}t|drGt|jj}d|vxrt!|dkD}|| z}d |v}|| z}|j|st"j%d |S) zp Determines whether to trigger auto-compilation of the model's forward pass at generation time. FcudaNrV hf_quantizerrcpur!diskzsYou have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.)disable_compilerr`boolcompile_config_compile_all_devicesrrr rrrIrrrrrrr) rrrvalid_hardwareusing_compilable_cache can_compileall_model_deviceshas_cpu_offloadhas_disk_offloads rp_valid_auto_compile_criteriaz,GenerationMixin._valid_auto_compile_criteriaXsH  , ,))V3 t  , ,D 8 r=N=]=]=r=r8  |''(9:E B u|TeGfGuGu %?)?  4 . : 4,,;; ;K 4 ) #D$6$6$=$=$? @ #'88WSAR=SVW=WO . .K &)::  // /K  + + 7   #  rocustom_generatec(| dt|x}vrytj|jj ddj d|d|d|s9t |jj ddj d|d |S) zP Returns the Hub repo for a deprecated generation mode, if any. N/r z6 was moved to a `custom_generate` repo: https://hf.co/zC. To prevent loss of backward compatibility, add `custom_generate='z*'` to your `generate` call before v4.62.0.zY requires `trust_remote_code=True` in your `generate` call, since it loads https://hf.co/r)GENERATION_MODES_MAPPINGrrnamereplacetitler)rrrrWrepos rp_get_deprecated_gen_repoz(GenerationMixin._get_deprecated_gen_repos  &#>VWf>g6gd*h##++C5;;=>>tuytz{PPTvV6 6 !"''//S9??ABC004vQ8  rocf|jdd|jdd||d}|0txs t|xrtjdkDn||d<|j Dcic] \}}| || }}}t |trtjtjjj} tj|jj} | | z } | Dcic]}||vs||j|}}|Scc}}wcc}w)zn Extracts and returns the generation mode related keyword arguments from the provided kwargs. rNrQ)rrQrOrr! synced_gpus)rrrdistget_world_sizerrrrrrr[rr) rrWrrbrOrrrrusual_mode_kwargscustom_generate_kwargsnew_custom_keyss rp_extract_generation_mode_kwargsz/GenerationMixin._extract_generation_mode_kwargss2 K6#)::.CT#J. " "( ) I-CD-I htObObOdghOh }- 4J3O3O3Q!c41aUVUb!Q$!c!c ox 0 ' 1 1/2I2I J U U Z Z \ %,%6%6%G%R%R%W%W%Y "47HHO@O%_1STX^S^aA&6%_ "%_%%"d&`s2 D(=D( D.D.rbrrSc  \ | jdd}| tt| trdhd}tj Dcic] \}}||vs ||}}}|j | |j | fd|i| }|d$d|i|S|j| | |||}|j|| fi| \}}|j|}t| tr| }ntt|t|}|j|j|j!||||j#||| x}r$t%j&|f||||||| | | ||d || S||n t)}||n t+}dt-t/j0|j2j4j7v}d|v}|j9dddu}|j;||j<|\}}}dt/j0|j4j7vr||d<|j>d }|j@}|jC||| |jDjFsj|jH^|d kDrYtK|j>d k(rAtMjN|ddd f|jHk(d kDrtPjSd|jDjFs |dk(rd|_*|s|r|r|jW||||d<n-|r+|dk(r&tK|dj>d kDr tYd|jDjFrd|vr|j[||||}|jDjFr.|j]||||j^|j@\} }n|dk(r|n|jd} |j`d$| tc|jd|jf|jDjFd|\} }|jhr!|jk| |j9d} ||jm| jo| j>d }!| j9dduxr|jpdu}"| j9dduxr|jrdu}#|ju||"|#|||!}|jwr d|vrd |d<|jy||!|"|jpd z }$|j>d |!k7r-|dk(r(|jDjFs|$|j>d z }$|j{|||||$|j@j| j@jk7r`t}j~d| j@jd|j@jd|j@jdt|j||!||||j@|| |  }%|j|||j9d}&|jT|d <||| f|%|&|d!||}'|jdurAt|'d"r5t|'jd#|'jj|'_E|'Scc}}w)%a Generates sequences of token ids for models with a language modeling head. Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the model's default generation configuration. You can override any `generation_config` by passing the corresponding parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation strategies and code examples, check out the [following guide](../generation_strategies). Parameters: inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. generation_config ([`~generation.GenerationConfig`], *optional*): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, the default will be used, which has the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. logits_processor (`LogitsProcessorList`, *optional*): Custom logits processors that complement the default logits processors built from arguments and generation config. If a logit processor is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. stopping_criteria (`StoppingCriteriaList`, *optional*): Custom stopping criteria that complements the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. If your stopping criteria depends on the `scores` input, make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is intended for advanced users. prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): If provided, this function constraints the beam search to allowed tokens only at each step. If not provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904). synced_gpus (`bool`, *optional*): Whether to continue running the while loop until max_length. Unless overridden, this flag will be set to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): The negative prompt needed for some processors such as CFG. The batch size must match the input batch size. This is an experimental feature, subject to breaking API changes in future versions. negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Attention_mask for `negative_prompt_ids`. use_model_defaults (`bool`, *optional*): When it is `True`, unset parameters in `generation_config` will be set to the model-specific default generation configuration (`model.generation_config`), as opposed to the global defaults (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be `True`. custom_generate (`str` or `Callable`, *optional*): One of the following: - `str` (Hugging Face Hub repository name): runs the custom `generate` function defined at `custom_generate/generate.py` in that repository instead of the standard `generate` method. The repository fully replaces the generation logic, and the return type may differ. - `str` (local repository path): same as above but from a local path, `trust_remote_code` not required. - `Callable`: `generate` will perform the usual input preparation steps, then call the provided callable to run the decoding loop. For more information, see [the docs](../../generation_strategies#custom-generation-methods). kwargs (`dict[str, Any]`, *optional*): Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. Return: [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`. If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~utils.ModelOutput`] types are: - [`~generation.GenerateDecoderOnlyOutput`], - [`~generation.GenerateBeamDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - [`~generation.GenerateEncoderDecoderOutput`], - [`~generation.GenerateBeamEncoderDecoderOutput`] rN>rrrWrglobal_keys_to_excludemodel) rrrNrrfrOrgrhrrWrrrrrrpr!r rzA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.rTrz1`attention_mask` passed to `generate` must be 2D.)rrrr,r)rr:rrrVr)rrrrrrr6z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.) rrdrerfrNrrrgrh)rrrr)rNrrrVto_legacy_cachern)Grrstrlocalsrrrrhrget_generation_moderrr`r[rrrr`rrr:rOrrrrrrrrrrrrFrrrrrirrrrrrr+r9rAr@rrr token_healing heal_tokensputrJrVrrr7rr4r{r|r}rrreturn_legacy_cacherrVrl)(rrrrNrrfrbrOrrgrhrrWrrrjrrgenerate_argumentsrrrrdecoding_methoddeprecated_mode_repoaccepts_attention_maskrequires_attention_maskr8rrrrrrrrr!prepared_logits_processorprepared_stopping_criteriaresults( rprzGenerationMixin.generatesf#JJ':DA  &:os+K& "@Fx~~?O!ueSV^tSt#u*!u !u  % %f -'@t'@'@(3D(HN( $,M$M:LM M"&!E!E      " +J$*I*I 1+ 5;+ '<,??P ox 0-O&d4j2J?2[\O ##L$5$5$78 &&8IKab $(#@#@Rcet#u u  u"++"3!1"3)A /$7/M#5 4"3) $0@/K+QdQf1B1N-ThTj!1S9J9J4<<9X9c9c9h9h9j5k!k"3<"G$0$4$45Et$LTX$X!9=8R8R %22L9 5 ' g//@KKPPR R6C "? 3"((+ %% $$%68QZ`$a{{--"33?N ++,1IImArE26G6Y6YYZ]^^l{{--2Bo2U*.  '(-DI_-1-X-X0,.L) *';.3|DT7U7[7[3\_`3` !TUU ;; ) ).?|.SNN|-=?PL ;; ) )&*&T&T%!1)'8'T'T$++ 'U' #I|*:[)H lN^N^_jNkI#E$"D"D# -779J9_9_`#{{==#  #  <  * *((4J4N4N{4[\I   LL )%??1-!'L!9T!A!nFWFbFbjnFn!'L!9T!A!nFWFbFbjnFn ::/#9#9-'- ;   ( ( */?|/S-.L) * ''(9;KMcd -77!;    "&6 6 O3KK22  3 3A 6 6  ** |_jJZ  ;;  y//44 4 MM@@I@P@P@U@U?VW++**+,TTXT_T_TdTdSef* *  %)$>$>/!1+%=- ''% 3+I%? % !&*%@%@//,00=&A& "%6$?$? [!!   78/  %     1 1T 9 12..0ABN%+%;%;%K%K%MF " y"vs  Z(Z(this_peer_finishedc|r_tj|rdnd|}tj|tjj |j dk(ryy|ryy)z Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is fed through `this_peer_finished`. ZeRO stage 3-friendly. rvrnrp)opFT)rir<rc all_reduceReduceOpSUMr6)rr|rbrthis_peer_finished_flags rp_has_unfinished_sequencesz)GenerationMixin._has_unfinished_sequences s^ ',ll:L3RU^d&e # OO3 8I8I J&++-4 roc tdjj}}tj }t d|}j |dDcgc]}|j}}|ddjj|j}tj||k(||}|jd k(r|S|ddd fj} jd 0j!jd d fd | D} n fd | D} t#t%| | D]\} \} } || }tj&||k(j)r5 |j+| Dcic]}j|fd}}t-|dk(ru|| fxxdz cc<|j/||dd } |jd k(rt-|||k7dk(r||d <|j1|j3d ||| <|Scc}wcc}w)a Generates sequences of token ids for models with a language modeling head. Parameters: input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation. tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids. Return: `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension. Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r!)rr T)skip_special_tokenspt)return_tensorspaddingrrrZc3`K|]%}j|jd'yw)rZN)decoder])rt space_tokrs rpr!z.GenerationMixin.heal_tokens..T s)W))!,44S)DWs+.c3@K|]}j|ywr)r)rrrs rpr!z.GenerationMixin.heal_tokens..V s?))!,?r")prefixg$@rnrl)r)rrr r get_vocabr. batch_decodestriprrHrriwherenumeltolistconvert_tokens_to_idsconvert_ids_to_tokens enumeratezipr5r6 extensionsrrrrE)rrrrr  vocab_trierrpromptstail_ids tail_toks batch_idxtail_idtail_tok batch_idsalt_tokseq_bias trimmed_idsrs ` @rprqzGenerationMixin.heal_tokens) s  *  &/%;%;Y=S=Sl #I$7$7$9: ,ALY'0&<&. s'lh.(I'lsrnNrrcrdr Fr0TOKENIZERS_PARALLELISMflash_attention_2zWhen using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as FA2 introduces graph breaks. We overrode the option with `fullgraph=False`.Trpr%rrrrrr0r! num_samplesrrV r`rarbrsrtrurvrwrVr`rarbrcrdrV)5rr#r$ output_scores output_logitsreturn_dict_in_generater r]rrrrrirrrr__call__rVrenviron_attn_implementationrN fullgraphrrget_compiled_callprefill_chunk_size_prefill_chunkingrrrMrbrHfloat32rurcrvrwrdr rsoftmax multinomialsqueezeargmaxr7rrrJrendrrr_)$rrrNrrrbrrr r#r$rrrhas_eos_stopping_criteriar]ra raw_logitsrurvrwrsrtrcur_lenr|unfinished_sequences model_forwardcompile_forward is_prefillrrAnext_token_logitsnext_token_scoresprobs next_tokenss$ rpr[zGenerationMixin._sample~ s*V):: -??0EE)77 )77 "3"K"K$''lZk'l$l!%// 0M3 RD $;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!Rmq H\ ./33ODbf " (oobq1 G"$zz*EJJyO_O_`77AQAQS_`  ;;LJ[\ 36BJJ/ 0{{//3FF$33?DUDdDdDnDn''fBG%44> 223D3S3STM  / / ;1411)=N_R^_LJJ,,-?U^UeUe,f=4==iX<XL@@4@" 'I,IDI CC#';;#A#ADL 1!(q"ax 8 ; ;U]]clcscs ; t !1 [batch_size * num_beams, ...]rr!r Nrrrir)r<rs rp_flatten_beam_dimz!GenerationMixin._flatten_beam_dim> s>V\\"}}VeAhq&9%:U12Y%FGGrorcht|j}tj|||g|ddzS)z=[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]r!Nr)r<rrrs rp_unflatten_beam_dimz#GenerationMixin._unflatten_beam_dimD s3V\\"}}Vj)% Use a heuristic that assumes the best score comes from the current length minus the decoder prompt length. -> See detailed discussion: https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565 2. `early_stopping == "never"`: -> Estimate the best score using either `max_length` or `cur_len`, depending on the sign of `length_penalty`. -> A positive length penalty favors longer sequences, so we use `max_length` in that case. NOTE: the canonical beam search implementation can be replicated with `early_stopping="never"` and `length_penalty=0.0`, which are NOT the default flags. The default behavior was empirically found to produce better sequences (prior to 2022), and changing it is BC breaking. neverrvNr!T)r1keepdimrer)rirrr ) rrrrrrVrrrbest_hypothetical_lengthbest_possible_running_scoreworst_finished_scores rp_check_early_stop_heuristicz+GenerationMixin._check_early_stop_heuristic^ sJ W $#)='14F'F $'.1C'C $&9!RaR%&@D\^lDl&m#${{+;UYY{XYcg=hij=kmst2UYY '*> >BPT6   ro!next_token_hits_stopping_criteriactj|}tj||duz}tj|}||z|zS)zv Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False T)rir r5)rrrrimprovement_possibleexists_open_beamvalid_continuationss rp%_beam_search_has_unfinished_sequencesz5GenerationMixin._beam_search_has_unfinished_sequences sZ %yy)LM#YY'78Nd!$($6$67KMf$g!!%!3!34EG`!a*,19q!W}-||JxGLLRQRSV__ !:\!IH^!!Q2D(D"DE57PPProrrrc||jtjdzz}tj||d}|j ||}|j ||} |j ||} || | fS)z Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the best non-finished beams to continue beam search in the next iteration. rrr!)rHrirrr) rrrrrrtopk_running_log_probsnext_topk_indicesrrrs rp%_get_running_beams_for_next_iterationz5GenerationMixin._get_running_beams_for_next_iteration s"02S2V2VW\WdWd2ehn2n!n!JJ'=KAN ../EGXY"001GIZ[#112KM^_ "57KKKrotop_num_beam_maskc| | dddfz}|| dz| z |zz }tj|dd|duz}||jtjdzz }||jtjdzz }||dzz }tj||fd}tj||fd}tj||fd}tj||fd}tj || d}|j ||}|j ||}|j ||}|j ||}||||fS) z Updates the finished beams if (and only if) there are new completed sequences that have a higher score than the current finished sequences. Nr!rT)axiskeepdimsrr0r)rir5rHrr7rr)rr`rrrr{rrrrrrrrrrdid_top_num_beams_just_finishedbeams_in_batch_are_fullmerged_sequences merged_scoresmerged_beam_indicesmerged_is_sent_finishedtopk_merged_indicess rp_update_finished_beamsz&GenerationMixin._update_finished_beams s2+LN_`dfg`gNh*h'(GaK:L,LQ_+_`"')),<2PT"UYgkoYo"p144U]]CfLL??CCEMMRU[[[ ;;vEE !99i1G%HaP ;"?QG #ii7P(QWXY"'))-=?^,_ef"g#jj)DQG&&'79LM ((8KL ))*=?RS --.EGZ[+|5EEEroc |j}|j}|j} |j} |j} |j } |j } |j}|j}|j}|j}|j}|j}|jdd\}}||z}|jjdk(r|j j"}n|jjdk(r|j%j&}nT|jjdk(r|j j(}n$|j j+j,}|}d}||jdnd}t/dd|z|z}t1j2t1j4|t0j6 t1j8||z t0j6 fd j;|j<}|j?||j<|}|j@}|r tCd | r| rd nd}| r| rd nd}| r| rd nd} | r| rd nd}!| r| rd nd}"| r| rd nd}#| rF|j jDr0| r|d jGdnd}$| r|d jGdnd}%| |xs|dnd}&t1jH|||f|&t0jJ|j<}'|jM||||'ddddd|f<|'jOjQ}(t1j8||ft0jR|j<})d|)ddddf<t1jH||fdt0jR|j<}*t1j8||ft0j6|j<}+t1j4|dft0j6|j<},t1j8||ft0j6|j<}-t1jH||||z fdt0jT|j<}.|.jOjQ} |jW|||j<r!|jY|'ddddd|f}/|jZ|/fi|}0|d i|0ddi}1|j]|1||j jD}|r|r|1j^dddddfj;dt0j`|j<}2tbjdjg|2d }3||/|3}3| r| r||2jQfz }| r| r||3jQfz }| rY|!|j jDr |1jhfn |1jjfz }!|j jDr|"|1jlfz }"| r3|#|j jDr |1jnfn |1jpfz }#~1|jM|3||}3|3|)dddddfz}3t1jr|3|||zf}3|ju|3|'|.||||||| \}4}5}6||jY|5ddddd|dzf|}-|jM|-||}-|jw|4|5|6|-|\}'})}.|jy|(|5|*|4| |6|,|+|-||||||\}(}*} }+|jGddR|jY|.d||z f}7t{|dr|j}|d|7|d<n|dj|7|dz}|j|,|)|*|+||||| },|j|,|+|-| }|jW|||j<r!|jY|(ddd|ddf}(|jY|*ddd|f}*|jY| ddd|ddf} | dzj7jd j/}8||8z}9|(ddd|9f}(| ddd|8f} | ra| sd}*|j jDr%t|(|*||| $%|!|"|#|jGd St|(|*||| |!|#|jGd!S|(S)"a Generates sequences of token ids for models with a language modeling head using **beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. If it's the first time you're diving into Beam Search, we recommend you read the following blog post: https://huggingface.co/blog/how-to-generate (especially the beam search section). You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores) Parameters: input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`LogitsProcessorList`): An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. stopping_criteria (`StoppingCriteriaList`: An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] used to tell if the generation loop should stop. generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): Whether to continue running the while loop until max_length (needed to avoid deadlocking with `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. Nr MoshiDepthDecoderImageGPTForCausalImageModelingBarkSemanticModelFrr!rr0z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.rnrrcrdr) fill_valuerrrrrpr%Trr) rrrrrr]rrrWr)rrrrr)r`rrrr{rrrrrrrrrrrV._reorder_cache) rrrrrrVrrr) r`rzrarbr{rsrtrurvrwrV)r`rzrarbr{rcrdrV)Errr#r$rrrr]rrrVrrrrreraudio_vocab_sizeget_output_embeddings out_featuresoutput_vocab_sizer\rWrrir7rrMzerosrHrr low_memoryrrrfullrrdetachrfloatint32rrrrMrbrr rrrurcrvrwrdrrrr rr reorder_cacherrrr}ry):rrrNrrrbrr r r#r$rrrr]rrrVrrbatch_size_unflattenedrrrWrr| n_eos_tokensrr sequential all_scoresrr{rurvrwrsrtoutput_fill_valuerr`rrrrrrflat_running_sequencesr model_outputsrb log_probsrrrbeam_idxmax_generated_length output_lengths: rpr\zGenerationMixin._beam_search' s \):: (:: -??0EE)77 )77 "3"K"K%// *99*99&11 %// 0EE*3//"1*='+y8 >> " "&9 955J ^^ $ $(H H335BBJ ^^ $ $(; ;66J446AAJ$" 1=0H|))!,a Aq</09< !II ZZ5:: 6 ]U^E^glgqgq8r s  "Y     77AQAQS_` '11 t  4 RD 3 RD 5-rd $;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!Rmq H\ ./33ODbf "@L?WL;LO]_!JJ J /(++##   -1,D,DYPZ\e,f!Q.)%,,.446 $kk:y*A]f]m]mn%)AqrE"jj*i!8TQVQ\Q\eneueuv !;; I'>ejjYbYiYij/4jj*aPUPZPZclcscs.t+-2KK  #5::i>N>N- ) %zz J$8 9bPUP[P[dmdtdt ,224::< ,,-?U^UeUe,f%)%;%;TeXdeL B<BTBM CC#';;#A#ADL 1#))!R(366D ^g^n^n6oF 11&b1AI()?KI' 6<<>"33J*}9??#4"66J$&;;99'99;+668& {{55(]-K-K,MM(');;99'<<>+99;)00J RI!$71d $CCI i*i*>T1UVIQUPmPm&/"3%9#5#+#%%Qn Q MN24M1B&&'=aMgPQkM>Q'RS1 -150H0H1:}1 - LPKuKu-'=*C2S# LvL H 24HFJE`E`#'='-)*C4W!12S"3##5--FaF BI{L2B. 148D112FsGVhLhGh2ij4!126:6I6I,WhJiks6tL!23 !23AA(KkG262R2R4W$7'!1%#5--3S 3 /&*%O%O3 1 &" G,,-?U^UeUe,fX**9Q8M9M8Mq5P+QR ,,[S?S>SUV;V.WX ".!1 7 7 9>>1>EIIK*-AA a-/0 #A'<(<'<$<= " " {{--7'%0%%!-'9*?'9%5*?$0$4$45F$G  5'%0%%!-1"7$0$4$45F$G   roc ./| ds td|jdvs2d| vr9t| ddr*td| djDr td|j |||||| | | } |j } |j}|j}|j}|j}|j}|r|rd nd }|r|rd nd }|r|rd nd }|r|rd nd }|r|rd nd }|rF|jjr0|r| d jd nd }|r| d jd nd }|jd d\}}|dkDr tdt!j"|t j$|j&}|j)||j&| } d}d}|j+|||j&r|jd}| j-|\}} |j/|j&}| | j/|j&} |jd|jdz }!||d }"t1j0| }#t3|#|jd|jj}#t5|#|jd}#d|#vrQt!j6|#dt!j8|||!z|j&t j$fd|#d<|j:|fi|#}$d|$vr|!dz|$d<|d i|$}%|%j<d d |! dz d fj/t j>|j&..jA/tC|dkDr. shAGA/7hsz=assisted generate is not supported with Static cache classes`)rrrrOrNrPrQrrnNrrcrdr r!z6assisted generate is only supported for batch_size = 1rFTrprr;rr0r6rr)rrBc36K|]}dd|ddfywrrn)ri new_logitss rpr!z5GenerationMixin._assisted_decoding..Ps#[AJq!Qw$7#[c36K|]}dd|ddfywrrn)rr.rs rpr!z5GenerationMixin._assisted_decoding..Rs'fq(9!Q'(B'fr0)is_decoder_attentionrO heuristicrr)?rrrr r*rcr]r#r$rrrrrrrrirrrrrget_candidatesrHrr)r*r7rGrrbrrrrange_speculative_samplingrrrrrrrrrJrVcropupdate_candidate_strategyrMrl_split_model_outputsrvrurcrwrdrrrOrnum_assistant_tokens_schedulenum_assistant_tokensrrr_)0rrrNrrrbrrrOrQrrrbr]r#r$rrrrarrurvrwrsrtrrrr|is_first_iterationcandidate_input_idscandidate_logitscandidate_lengthis_done_candidatecandidate_kwargsrrAr. valid_tokens n_matchesrselected_tokenscandidate_new_tokens new_cur_lennewly_added_lengthr/rs0 @@rpr]z"GenerationMixin._assisted_decodingu stK(JK K  1 15[ [  - %67BhN_A`AgAghh\] ]";;/'+-& 3%<  &// -??0EE)77 )77 "3"K"K0M3 RD $;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!Rmq H\ ./33ODbf " (oobq1 G >UV V$zz*EJJyO_O_`77AQAQS_` "!,,-?U^UeUe,fooa(G5H4V4VW`4a 1 !1"5"8"8"E +#3#6#6t{{#C 288;iooa>PP  12Et L  $yy6 6 "5";";A"> @^@^   77GI\IbIbcdIef #335:YY()9: Wg8H.HQZQaQainisist 6 !12>4==>QfUefL</1AA1E -.*\*G!,<+;(  ;(z (## (  1 12 (!!1!12 ( u  %"2"22 3 (D=(##=(  1 12=(!!1!12 =( u  %"2"22 3 =(D,0595959 M##M"%M!!1!12 M   1 12 M !!1!12 Mb*./3:> A0&A0u||,A0tC$567 A0 u||Xc]Dell1B,CC D A0J*./3:> `&`u||,`tC$567 `    `@"||","38n "    "H'||'#3- ' , ' c3h '^*. 9/9/9/3 ,- 9/ !& 9/ & 9/ uc5<<&7!88 99/v#(04 ' '  'E,,- ' uc3h/ 0 ' 'L$) //38n/! /  / c3h /p8<@DCGP#+P###P#|| P# . P# 38n P#""34P###<=P#&&?@P# P#j/38 $156:AE[+['sm[$E$4$45 [ #+8S%,,4Gc4R+S"T [ ##67 [ [tCH~.[&ell3[)1(>[ [B:> $+$$$89$56 $  $L#/1EEF#.0DDE# "$88 9 #R04!& x!<<x!ell#x!u||, x!  x!  x!t,\04S>0d*X6!v.2i/#$45i/%TNi/ i/ % & i/V:,s,,TW,jo,\   &K+KK( K  K  K KZZ$Z5959 PS+PS$,D>PSu||S012 PSd'c3h'\l'qu'Z*. ' "#  # 0& c3h&@U]]_*.8<:><@W[&*7;-16:AE-1:>[&[$$45[##67 [ $$89 [ #+8S%,,4Gc4R+S"T [d^[""34[>*[&ell3[)1(>[%TN["%X "67[ ~u/// 0[[z Dt]b]i]inr&]aS))S6>?X6YS   Sv"-1~##~.~0 ~ , ~  ~>*~ $e&6&66 7~@H%,,H5<<HH JELLJcJcJV[VbVbJJ ell%,,5<<&, -2\\, "\\, \\,  ,, ,  ,  ,  , dCi(, , , \M-2\\M,,M,1<<MdCi( MM,4Q$||4Q!<<4Q$ll 4Q  4Q  4Q4Q4Q4Q4Q4Q u||U\\5<<7 84QlL L!& L$)<< L ,1<< L  L u||U\\5<<7 8L,3F<<3F!& 3F\\ 3F  3F ll 3F$)<<3F.3\\3F ,,3F,1<<3F!<<3F3F3F 3F3F dCi(!3F" u||U\\5<<E F#3Fz" L##L.L0 L , L  L !5#3#33 4Lh "-1597;CG9=d##d.d0 d , d  d>*d  1 12d""34d&&?@d56d $e&6&66 7dL *5+;+;*P`*rorcd|dd| df}|jd}|ddtj||fjdd}|jd}|ddtj||fjdd} | |z } tj| } | | k} | j ddkj } |r| |k(r| dz} |ddd| dzf}|| fS|jd}|dd| ddf}| |krF|dd| ddf}tj||z d}|j|j n|}tj|djddddf}| dkDr&tj|ddd| f|fd}|| fS|}|| fS)a Applies sampling as in the speculative decoding paper (https://huggingface.co/papers/2211.17192, algorithm 1). Returns the selected tokens, as well as the number of candidate matches. NOTE: Unless otherwise stated, the variable names match those in the paper. Nrr0rr!)rr) rrirGr rand_likerrrclampdiv_rr7)r=r>r?r/r@new_candidate_input_idsqq_irp_iprobability_ratior_i is_acceptedrCrBgamma p_n_plus_1 q_n_plus_1p_primers rpr6r6s2!6F5F5G2GH   R (A Au||,-/F F G O OPQST UCr"A Au||,-/F F G O OPQST UCc  //+ ,C**K,&&2&.2779IY*:: Q .q/IM//AB &  ""!!&&q)q)Q' u 1i?+Jkk: #:CG LL ' G   g1 5 = =a @q I q= 99&=a)m&La%PVXYL  ""L  ""roc*t|dk(r