# Recovered from a compiled artifact of `transformers/tokenization_mistral_common.py`.
# Docstrings and names below are taken from the artifact; method bodies were not recoverable
# and are left as `...` stubs, and signatures are reconstructed from the recovered docstrings,
# so they may differ in detail from the real module.
import os
import shutil
import warnings
from collections.abc import Mapping, Sized
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Optional, Union, overload

import numpy as np

from transformers.audio_utils import load_audio_as
from transformers.tokenization_utils_base import (
    LARGE_INTEGER,
    VERY_LARGE_INTEGER,
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    PreTrainedTokenizerBase,
    TextInput,
    TruncationStrategy,
)
from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
from transformers.utils.generic import is_torch_tensor
from transformers.utils.hub import PushToHubMixin
from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires


if is_mistral_common_available():
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    from mistral_common.protocol.instruct.validator import ValidationMode
    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
    from mistral_common.tokens.tokenizers.image import MultiModalVersion
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
    from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub

if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


# Constant names are assumed (not recoverable from the artifact); their contents are original.
ENCODE_KWARGS_DOCSTRING = r"""
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding the sequences. This will use the underlying
            `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
            automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
            automatically.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
            Activates and controls padding. Accepts the following values:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
            Activates and controls truncation. Accepts the following values:

            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length`
              or to the maximum acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
              lengths greater than the model maximum admissible input size).
        max_length (`int`, *optional*):
            Controls the maximum length to use by one of the truncation/padding parameters.

            If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
            is required by one of the truncation/padding parameters. If the model has no specific maximum input
            length (like XLNet) truncation/padding to a maximum length will be deactivated.
        stride (`int`, *optional*, defaults to 0):
            If set to a number along with `max_length`, the overflowing tokens returned when
            `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
            returned to provide some overlap between truncated and overflowing sequences. The value of this
            argument defines the number of overlapping tokens.
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            `>= 7.5` (Volta).
        padding_side (`str`, *optional*):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            - `'pt'`: Return PyTorch `torch.Tensor` objects.
"""

ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
        return_attention_mask (`bool`, *optional*):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific tokenizer's default, defined by the `return_outputs` attribute.

            [What are attention masks?](../glossary#attention-mask)
        return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
            of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
            of returning overflowing tokens.
        return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
            Whether or not to return special tokens mask information.
        return_offsets_mapping (`bool`, *optional*, defaults to `False`):
            Whether or not to return `(char_start, char_end)` for each token.

            This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
            Python's tokenizer, this method will raise `NotImplementedError`.
        return_length (`bool`, *optional*, defaults to `False`):
            Whether or not to return the lengths of the encoded inputs.
        verbose (`bool`, *optional*, defaults to `True`):
            Whether or not to print more information and warnings.
        **kwargs: passed to the `self.tokenize()` method

    Return:
        [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

        - **input_ids** -- List of token ids to be fed to a model.

          [What are input IDs?](../glossary#input-ids)

        - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
          `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

          [What are attention masks?](../glossary#attention-mask)

        - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
          `return_overflowing_tokens=True`).
        - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
          `return_overflowing_tokens=True`).
        - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
          regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
        - **length** -- The length of the inputs (when `return_length=True`)
"""
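
# Illustrative usage sketch (not part of the recovered module): how the keyword arguments
# documented above surface on `MistralCommonTokenizer.__call__`. The repo id is only an
# example of a checkpoint that ships a `mistral-common` tokenizer file; requires
# `mistral-common` installed (and `torch` for `return_tensors="pt"`).
def _usage_sketch_encode_kwargs():
    from transformers import MistralCommonTokenizer  # assumes a transformers version exporting this class

    tokenizer = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
    batch = tokenizer(
        ["Hello world!", "A noticeably longer sentence, to show padding at work."],
        padding="longest",
        truncation=True,
        max_length=16,
        return_special_tokens_mask=True,
        return_length=True,
        return_tensors="pt",
    )
    # `input_ids`, `attention_mask`, `special_tokens_mask` and `length` come back as tensors.
    return batch
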
class MistralTokenizerType(str, Enum):
    """Enum for the different type of tokenizer."""

    spm = "spm"
    tekken = "tekken"


@requires(backends=("mistral-common",))
class MistralCommonTokenizer(PushToHubMixin):
    """
    Class to wrap `mistral-common` tokenizers.

    `mistral-common` is the official tokenizer library for Mistral AI models. To use it, you need to install it with:

    ```bash
    pip install transformers[mistral-common]
    ```

    Otherwise the tokenizer falls back to the Transformers implementation of the tokenizer.

    For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).

    This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`. It provides a
    Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.

    Supports the following methods from the `PreTrainedTokenizerBase` class:

    - [`~MistralCommonTokenizer.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
    - [`~MistralCommonTokenizer.encode`]: Encode a string to a list of integers.
    - [`~MistralCommonTokenizer.decode`]: Decode a list of integers to a string.
    - [`~MistralCommonTokenizer.batch_decode`]: Decode a batch of list of integers to a list of strings.
    - [`~MistralCommonTokenizer.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
    - [`~MistralCommonTokenizer.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
    - [`~MistralCommonTokenizer.tokenize`]: Tokenize a string.
    - [`~MistralCommonTokenizer.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
    - [`~MistralCommonTokenizer.prepare_for_model`]: Prepare a list of inputs for the model.
    - [`~MistralCommonTokenizer.pad`]: Pad a list of inputs to the same length.
    - [`~MistralCommonTokenizer.truncate_sequences`]: Truncate a list of sequences to the same length.
    - [`~MistralCommonTokenizer.apply_chat_template`]: Apply a chat template to a list of messages.
    - [`~MistralCommonTokenizer.__call__`]: Tokenize a string or a list of strings.
    - [`~MistralCommonTokenizer.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face
      model hub or local directory.
    - [`~MistralCommonTokenizer.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the
      `from_pretrained` class method.
    - [`~MistralCommonTokenizer.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.

    Here are the key differences with the `PreTrainedTokenizerBase` class:

    - Pair of sequences are not supported. The signatures have been kept for compatibility but all arguments
      related to pair of sequences are ignored. The return values of pairs are returned as `None`.
    - The `is_split_into_words` argument is not supported.
    - The `return_token_type_ids` argument is not supported.
    - It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from
      Transformers. In `mistral-common`, special tokens are never encoded directly. This means that:
      `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs
      corresponding to the tokenization of the string `"<s>"`. For more information, see the
      [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).

    If you have suggestions to improve this class, please open an issue on the
    [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the
    tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it
    is related to the Hugging Face interface.
    """

    model_input_names: list[str] = ["input_ids", "attention_mask"]
    padding_side: str = "left"
    truncation_side: str = "right"

    def __init__(
        self,
        tokenizer_path: Union[str, os.PathLike, Path],
        mode: "ValidationMode" = ValidationMode.test,
        model_max_length: int = VERY_LARGE_INTEGER,
        padding_side: str = "left",
        truncation_side: str = "right",
        model_input_names: Optional[list[str]] = None,
        clean_up_tokenization_spaces: bool = False,
        **kwargs,
    ):
        """
        Constructs a `MistralCommonTokenizer`.

        - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (`str`) -- The default value for the side on which the model should have padding
          applied. Should be `'right'` or `'left'`.
        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
          applied. Should be `'right'` or `'left'`.

        Args:
            tokenizer_path (`str` or `os.PathLike` or `Path`):
                Path to the tokenizer file to load the `MistralTokenizer`.
            mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor.
            model_max_length (`int`, *optional*):
                The maximum length (in number of tokens) for the inputs to the transformer model. When the
                tokenizer is loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this
                will be set to the value stored for the associated model in `max_model_input_sizes` (see above).
                If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between
                ['right', 'left']. Default value is picked from the class attribute of the same name.
            truncation_side (`str`, *optional*):
                The side on which the model should have truncation applied. Should be selected between
                ['right', 'left']. Default value is picked from the class attribute of the same name.
            model_input_names (`List[string]`, *optional*):
                The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                `"attention_mask"`). Default value is picked from the class attribute of the same name.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not the model should cleanup the spaces that were added when splitting the input text
                during the tokenization process.
        """
        ...
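
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_loading():
        """Sketch only: two ways to build the wrapper.

        `from_pretrained` resolves the `mistral-common` tokenizer file (`tekken.json` or a
        `tokenizer.model*` file) from a Hub repo or a local directory; the constructor takes
        an explicit file path. The repo id and the local path below are placeholders.
        """
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-Nemo-Instruct-2407")
        # Or, with a tokenizer file already on disk (hypothetical path):
        # tok = MistralCommonTokenizer("/path/to/tekken.json", padding_side="left")
        return tok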
    @property
    def bos_token_id(self) -> int:
        """Id of the beginning of sentence token in the vocabulary."""
        ...

    @property
    def eos_token_id(self) -> int:
        """Id of the end of sentence token in the vocabulary."""
        ...

    @property
    def unk_token_id(self) -> int:
        """Id of the unknown token in the vocabulary."""
        ...

    @property
    def pad_token_id(self) -> int:
        """Id of the padding token in the vocabulary."""
        ...

    @property
    def bos_token(self) -> str:
        """String associated to the beginning of sentence token in the vocabulary."""
        ...

    @property
    def eos_token(self) -> str:
        """String associated to the end of sentence token in the vocabulary."""
        ...

    @property
    def unk_token(self) -> str:
        """String associated to the unknown token in the vocabulary."""
        ...

    @property
    def pad_token(self) -> str:
        """String associated to the padding token in the vocabulary."""
        ...

    def get_vocab(self) -> dict[str, int]:
        """Returns the vocabulary as a dictionary of token to index."""
        ...

    def encode(self, text: Union[TextInput, EncodedInput], add_special_tokens: bool = True, **kwargs) -> list[int]:
        # Listed in the class docstring; its own docstring was not recoverable from the artifact.
        ...

    def decode(
        self,
        token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        # Listed in the class docstring; its own docstring was not recoverable from the artifact.
        ...

    def batch_decode(
        self,
        sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> list[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Not supported by `MistralCommonTokenizer.batch_decode`. Will raise an error if used.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        ...

    def _is_control_token(self, token_id: int) -> bool: ...

    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        ...

    def _piece_to_id(self, piece: str) -> int: ...

    def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        ...
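
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_decoding():
        """Sketch only: id/token/string round trips (the repo id is an example checkpoint)."""
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        batch = tok(["Hello world", "Bonjour le monde"])
        ids = batch["input_ids"][0]
        print(tok.convert_ids_to_tokens(ids))                 # sub-word pieces, incl. specials
        print(tok.decode(ids, skip_special_tokens=True))      # back to "Hello world"
        print(tok.batch_decode(batch["input_ids"], skip_special_tokens=True))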
    def _text_to_ids(self, text: str, add_special_tokens: bool) -> list[int]:
        """
        Converts a string into a sequence of tokens ids, using the tokenizer.
        """
        ...

    def tokenize(self, text: str, **kwargs) -> list[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Not supported by `MistralCommonTokenizer.tokenize`. Will raise an error if used.

        Returns:
            `List[str]`: The list of tokens.
        """
        ...

    def _encode_plus(self, text, **kwargs) -> BatchEncoding: ...

    def _batch_encode_plus(self, batch_text, **kwargs) -> BatchEncoding: ...

    def _all_special_ids(self) -> set[int]: ...

    def get_special_tokens_mask(
        self,
        token_ids_0: list[int],
        token_ids_1: Optional[list[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> list[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the sequence.
            token_ids_1 (`List[int]`, *optional*):
                Not supported by `MistralCommonTokenizer`. Kept to match the interface of
                `PreTrainedTokenizerBase`.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        ...
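
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_tokenize():
        """Sketch only: `tokenize`/`convert_tokens_to_ids` and the special-token caveat.

        Per the class docstring, special tokens are never encoded from their string form, so
        encoding the literal text "<s>" yields ordinary ids rather than `bos_token_id`. The
        repo id is an example.
        """
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        pieces = tok.tokenize("Hello world")
        ids = tok.convert_tokens_to_ids(pieces)
        print(pieces, ids)
        # The string "<s>" is tokenized as plain text, not mapped to the BOS id:
        print(tok.encode("<s>", add_special_tokens=False), "vs bos id:", tok.bos_token_id)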
    def _get_padding_truncation_strategies(
        self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy.
        """
        ...

    def prepare_for_model(self, ids: list[int], **kwargs) -> BatchEncoding:
        # Listed in the class docstring ("Prepare a list of inputs for the model"); its own
        # docstring was not recoverable from the artifact.
        ...

    def _batch_prepare_for_model(self, batch_ids, **kwargs) -> BatchEncoding: ...

    def _pad(
        self,
        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in `padding_side` argument:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided
                value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute
                capability `>= 7.5` (Volta).
            padding_side: The side on which the model should have padding applied. Should be selected between
                ['right', 'left']. Default value is picked from the class attribute of the same name.
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to
                model specifics)
        """
        ...
    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            list[BatchEncoding],
            dict[str, EncodedInput],
            dict[str, list[EncodedInput]],
            list[dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence
        length in the batch.

        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the result will use the
        same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch
        tensors, you will lose the specific device of your tensors however.

        Args:
            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`,
                `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch
                of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or
                *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a
                PyTorch Dataloader collate function.

                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors), see the note above
                for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only
                  a single sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute
                capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between
                ['right', 'left']. Default value is picked from the class attribute of the same name.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask
                according to the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
        """
        ...
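
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_pad():
        """Sketch only: pad a batch of already-encoded inputs (the repo id is an example)."""
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        batch = tok(["short", "a noticeably longer input sentence"])   # unpadded python lists
        padded = tok.pad(batch, padding="longest", return_attention_mask=True)
        # Every row of `padded["input_ids"]` now has the same length, padded on `padding_side`.
        return padded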
    def truncate_sequences(
        self,
        ids: list[int],
        pair_ids: None = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
        **kwargs,
    ) -> tuple[list[int], None, list[int]]:
        """
        Truncates a sequence pair in-place following the strategy.

        Args:
            ids (`List[int]`):
                Tokenized input ids. Can be obtained from a string by chaining the `tokenize` and
                `convert_tokens_to_ids` methods.
            pair_ids (`None`, *optional*):
                Not supported by `MistralCommonTokenizer`. Kept to match the signature of
                `PreTrainedTokenizerBase.truncate_sequences`.
            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to
                `'longest_first'`):
                The strategy to follow for truncation. Can be:

                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided.
                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            stride (`int`, *optional*, defaults to 0):
                If set to a positive number, the overflowing tokens returned will contain some tokens from the
                main sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
            `Tuple[List[int], None, List[int]]`: The truncated `ids` and the list of overflowing tokens. `None` is
            returned to match Transformers signature.
        """
        ...
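
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_truncate():
        """Sketch only: drop the last two tokens of an encoded sequence (example repo id)."""
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        ids = tok("A sentence that is a little too long for our budget.")["input_ids"]
        truncated, _, overflow = tok.truncate_sequences(ids, num_tokens_to_remove=2)
        # With the default `truncation_side="right"`, `truncated` is `ids` without its two
        # last tokens, `overflow` holds the removed tokens and the middle value is always `None`.
        return truncated, overflow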
    def apply_chat_template(
        self,
        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
        tools: Optional[list[Union[dict, Callable]]] = None,
        continue_final_message: bool = False,
        tokenize: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        **kwargs,
    ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
        """
        Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.

        Args:
            conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]):
                A list of dicts with "role" and "content" keys, representing the chat history so far.
            tools (`List[Union[Dict, Callable]]`, *optional*):
                A list of tools (callable functions) that will be accessible to the model. If the template does
                not support function calling, this argument will have no effect. Each tool should be passed as a
                JSON Schema, giving the name, description and argument types for the tool. See our
                [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                for more information.
            continue_final_message (bool, *optional*):
                If this is set, the chat will be formatted so that the final message in the chat is open-ended,
                without any EOS tokens. The model will continue this message rather than starting a new one. This
                allows you to "prefill" part of the model's response for it. Cannot be used at the same time as
                `add_generation_prompt`.
            tokenize (`bool`, defaults to `True`):
                Whether to tokenize the output. If `False`, the output will be a string.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            truncation (`bool`, defaults to `False`):
                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
            max_length (`int`, *optional*):
                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`.
                If not specified, the tokenizer's `max_length` attribute will be used as a default.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`.
                Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
            return_dict (`bool`, defaults to `False`):
                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. If at
                least one conversation contains an image, its pixel values will be returned in the `pixel_values`
                key.
            kwargs (additional keyword arguments, *optional*):
                Not supported by `MistralCommonTokenizer.apply_chat_template`. Will raise an error if used.

        Returns:
            `Union[str, List[int], List[str], List[List[int]], BatchEncoding]`: A list of token ids representing
            the tokenized chat so far, including control tokens. This output is ready to pass to the model, either
            directly or via methods like `generate()`.
        """
        ...
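
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_chat_template():
        """Sketch only: tokenize a chat with the official request format (example repo id)."""
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        conversation = [
            {"role": "system", "content": "You are a concise assistant."},
            {"role": "user", "content": "Say hello in French."},
        ]
        ids = tok.apply_chat_template(conversation)  # list[int], control tokens included
        batch = tok.apply_chat_template([conversation], return_dict=True, padding=True)
        return ids, batch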
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        text_pair: None = None,
        text_target: None = None,
        text_pair_target: None = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy, None] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
        sequences.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of int
                (encoded strings).
            text_pair (`None`, *optional*):
                Not supported by `MistralCommonTokenizer`. Kept to match the signature of
                `PreTrainedTokenizerBase.__call__`.
            text_target (`None`, *optional*):
                Not supported by `MistralCommonTokenizer`. Kept to match the signature of
                `PreTrainedTokenizerBase.__call__`.
            text_pair_target (`None`, *optional*):
                Not supported by `MistralCommonTokenizer`. Kept to match the signature of
                `PreTrainedTokenizerBase.__call__`.
        """
        ...
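
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_call_encoded():
        """Sketch only: `__call__` also accepts already-encoded sequences (example repo id)."""
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        encoded = [tok("first example")["input_ids"], tok("a second, longer example")["input_ids"]]
        # `add_special_tokens=False` avoids adding BOS/EOS a second time to pre-encoded ids.
        batch = tok(encoded, padding="longest", add_special_tokens=False, return_length=True)
        return batch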
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        mode: "ValidationMode" = ValidationMode.test,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        model_max_length: int = VERY_LARGE_INTEGER,
        padding_side: str = "left",
        truncation_side: str = "right",
        model_input_names: Optional[list[str]] = None,
        clean_up_tokenization_spaces: bool = False,
        **kwargs,
    ) -> "MistralCommonTokenizer":
        """
        Instantiate a `MistralCommonTokenizer` from a predefined tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing the tokenizer config, for instance saved using the
                  [`~MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
            mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                Validation mode for the `MistralTokenizer` tokenizer.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached
                if the standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the vocabulary files and override the cached versions if
                they exist.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
                generated when running `hf auth login` (stored in `~/.huggingface`).
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether or not to only rely on local files and not to attempt to download any files.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we
                use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can
                be any identifier allowed by git.
            max_length (`int`, *optional*):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific
                maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            padding_side (`str`, *optional*, defaults to `"left"`):
                The side on which the model should have padding applied. Should be selected between
                ['right', 'left']. Default value is picked from the class attribute of the same name.
            truncation_side (`str`, *optional*, defaults to `"right"`):
                The side on which the model should have truncation applied. Should be selected between
                ['right', 'left'].
            model_input_names (`List[string]`, *optional*):
                The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                `"attention_mask"`). Default value is picked from the class attribute of the same name.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not the model should cleanup the spaces that were added when splitting the input text
                during the tokenization process.
            kwargs (additional keyword arguments, *optional*):
                Not supported by `MistralCommonTokenizer.from_pretrained`. Will raise an error if used.
        """
        ...
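
    # --- Illustrative usage sketch (not part of the recovered class) ---------------------
    @staticmethod
    def _usage_sketch_from_pretrained():
        """Sketch only: `from_pretrained` from the Hub or a local directory, with overrides.

        The repo id is an example; `padding_side`, `truncation_side` and `revision` are the
        documented keyword arguments above.
        """
        from transformers import MistralCommonTokenizer

        tok = MistralCommonTokenizer.from_pretrained(
            "mistralai/Mistral-Nemo-Instruct-2407",
            padding_side="left",
            truncation_side="right",
            revision="main",
        )
        # A local directory produced by `save_pretrained` works the same way:
        # tok = MistralCommonTokenizer.from_pretrained("./my_tokenizer_dir")
        return tok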
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        push_to_hub: bool = False,
        token: Optional[Union[str, bool]] = None,
        commit_message: Optional[str] = None,
        repo_id: Optional[str] = None,
        private: Optional[bool] = None,
        repo_url: Optional[str] = None,
        organization: Optional[str] = None,
        **kwargs,
    ) -> tuple[str]:
        """
        Save the full tokenizer state.

        This method makes sure the full tokenizer can then be re-loaded using the
        [`~MistralCommonTokenizer.tokenization_mistral_common.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify
                the repository you want to push to with `repo_id` (will default to the name of `save_directory` in
                your namespace).
            token (`str` or *bool*, *optional*, defaults to `None`):
                The token to use to push to the model hub. If `True`, will use the token in the `HF_TOKEN`
                environment variable.
            commit_message (`str`, *optional*): The commit message to use when pushing to the hub.
            repo_id (`str`, *optional*): The name of the repository to which push to the Hub.
            private (`bool`, *optional*): Whether the model repository is private or not.
            repo_url (`str`, *optional*): The URL to the Git repository to which push to the Hub.
            organization (`str`, *optional*): The name of the organization in which you would like to push your
                model.
            kwargs (`Dict[str, Any]`, *optional*):
                Not supported by `MistralCommonTokenizer.save_pretrained`. Will raise an error if used.

        Returns:
            A tuple of `str`: The files saved.
        """
        ...
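

# Illustrative end-to-end sketch (not part of the recovered module): save the tokenizer file
# and reload it from the resulting directory. The repo id is a placeholder checkpoint that
# ships a `mistral-common` tokenizer file.
def _usage_sketch_save_and_reload():
    import tempfile

    from transformers import MistralCommonTokenizer

    tok = MistralCommonTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
    with tempfile.TemporaryDirectory() as save_dir:
        saved_files = tok.save_pretrained(save_dir)   # copies the mistral-common tokenizer file
        reloaded = MistralCommonTokenizer.from_pretrained(save_dir)
    return saved_files, reloaded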