"""
Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
tokenization_utils_fast.py
"""

import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Optional, Union, overload

from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging


logger = logging.get_logger(__name__)

SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self, *args):
        self.data = {}
        self._tokens = set()
        self._termination_char = ""
        self.update(*args)

    def update(self, *args):
        """
        Updates the Trie with new tokens provided as arguments.

        Args:
            *args: Variable number of words to be added to the Trie.
        """
        for token in tuple(*args):
            self.add(token)

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie
        representation. The special key `""` in `self._termination_char` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty string
            return

        self._tokens.add(word)
        ref = self.data
        for char in word:
            ref[char] = ref.setdefault(char, {})
            ref = ref[char]
        ref[self._termination_char] = 1

    def split(self, text: str) -> list[str]:
        """
        Will look for the words added to the trie within `text`. Output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first!

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        # `states` tracks every possible partial match: the key is the index in `text` where the match started,
        # the value is the trie node reached so far.
        states = OrderedDict()

        # Indices at which the text needs to be cut. A cut at 0 is forced; the final cut at len(text) is added by
        # `cut_text`.
        offsets = [0]

        # Used by the lookahead to skip over text that was already consumed by a longer match.
        skip = 0

        for current, current_char in enumerate(text):
            if skip and current < skip:
                # Prevents the lookahead from matching twice (e.g. extra_id_100 and id_100)
                continue

            # Partial matches that stop matching at this character need to be dropped.
            to_remove = set()
            # Whenever a full match is found everything is reset: this is a greedy, longest-match-first algorithm.
            reset = False

            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # This is a complete match. Look ahead to favor the longest match and earlier partial matches
                    # (important in cases such as extra_id_1 vs extra_id_100).
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match starts later, we can stop looking.
                            break
                        elif lookstart < start:
                            # Earlier partial match: its trie pointer was already advanced, so indices are + 1.
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Here lookstart == start: the pointer was not advanced yet, indices are current ones.
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]
                        # End lookahead

                    # Storing the match boundaries and resetting.
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The current character extends this partial match: advance the pointer and store it back.
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # The partial match is broken; stop tracking it (cannot delete while iterating).
                    to_remove.add(start)

            # Either clear all starts (a full match was found) or only the partial matches that broke.
            if reset:
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # If this character starts a word of the trie, start tracking a new partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # Handle a match that runs up to the very end of the text.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # Longest cut is always the one with the lower start, so break after the first one.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        # We have all the offsets now, we just need to do the actual splitting. We need to eventually add the first
        # part of the string and the eventual last part.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                continue
            elif start == end:
                # This might happen on a match at index 0 or on two consecutive matches; skip zero-width cuts.
                continue
            tokens.append(text[start:end])
            start = end

        return tokens
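

# Usage sketch (illustrative only): this mirrors how `PreTrainedTokenizer` below keeps a `tokens_trie` of its added
# tokens and pre-splits raw text on them before running the model-specific sub-word tokenization. The token strings
# here are made-up examples:
#
#     trie = Trie()
#     trie.update(["[CLS]", "[SEP]", "<extra_id_0>"])
#     trie.split("[CLS] hello <extra_id_0> world [SEP]")
#     # -> ["[CLS]", " hello ", "<extra_id_0>", " world ", "[SEP]"]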


class ExtensionsTrie(Trie):
    def __init__(self, *args):
        super().__init__(*args)

    def extensions(self, prefix: str):
        """
        Generates all extensions of a given prefix token in the Trie.

        Example:

        ```python
        >>> trie = ExtensionsTrie()
        >>> trie.add("apple")
        >>> trie.add("app")
        >>> trie.add("application")
        >>> trie.extensions("app")
        ['app', 'apple', 'application']
        ```
        """
        prefix_node = self._get_node(prefix)
        ret = self._collect_tokens(prefix_node)
        return [prefix + token for token in ret]

    def _get_node(self, token: str) -> dict:
        """
        Retrieves the node corresponding to the given token in the Trie.

        Args:
            token (str): The token for which the corresponding node needs to be retrieved.

        Returns:
            dict: The node in the Trie corresponding to the given token.
        """
        node = self.data
        for char in token:
            if char not in node:
                break

            node = node[char]
        return node

    def _collect_tokens(self, node: dict) -> list:
        """
        Generates all tokens in the Trie starting from a given node.

        Args:
            node (dict): The node in the Trie from which tokens need to be generated.

        Returns:
            list: List of tokens generated from the given node.
        """
        tokens = [self._termination_char] if self._termination_char in node else []
        for token, subtrie_head in node.items():
            if token != self._termination_char:
                subtokens = self._collect_tokens(subtrie_head)
                tokens.extend([token + subtoken for subtoken in subtokens])
        return tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them as whitespace since they are generally
    # considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation. Characters such as "^", "$", and "`" are not in the
    # Unicode Punctuation class but we treat them as punctuation anyways, for consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


def _insert_one_token_to_ordered_list(token_list: list[str], new_token: str):
    """
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    """
    insertion_idx = bisect.bisect_left(token_list, new_token)
    # Checks if new_token is already in the ordered token_list
    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
        # new_token is in token_list, don't add
        return
    else:
        token_list.insert(insertion_idx, new_token)
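

# Quick reference for the character predicates above (illustrative; the classifications follow the Unicode-category
# checks performed by the functions themselves):
#
#     _is_whitespace(" ")       # True ("Zs" category, or one of \t \n \r)
#     _is_control("\x1b")       # True (category starts with "C")
#     _is_punctuation("'")      # True (ASCII punctuation range, or a "P*" category)
#     _is_end_of_word("hi!")    # True, the last character is punctuation
#     _is_start_of_word(" hi")  # True, the first character is whitespace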


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, **kwargs):
        # 1. Set up the trie used to split text on added tokens.
        self.tokens_trie = Trie()

        # 2. Init `_added_tokens_decoder` if the child class did not.
        if not hasattr(self, "_added_tokens_decoder"):
            self._added_tokens_decoder: dict[int, AddedToken] = {}

        # 3. If an `added_tokens_decoder` is passed, we are loading from a saved tokenizer, we overwrite.
        self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
        self._added_tokens_encoder: dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}

        # 4. Init the parent class.
        super().__init__(**kwargs)

        # 5. If some of the special tokens are not part of the vocab, we add them at the end. The order of addition
        # is the same as self.SPECIAL_TOKENS_ATTRIBUTES, following `tokenizers`.
        self._add_tokens(
            [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
            special_tokens=True,
        )

        self._decode_use_source_tokenizer = False

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

    @added_tokens_decoder.setter
    def added_tokens_decoder(self, value: dict[int, Union[AddedToken, str]]):
        # Always raise an error if a plain string slips in, because users should define the stripping/normalization
        # behavior explicitly through `AddedToken`.
        for index, token in value.items():
            if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
                raise TypeError(
                    f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__},"
                    f" should be a dict of {int, Union[AddedToken, str]}"
                )

            self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
            self._added_tokens_encoder[str(token)] = index
        self._update_total_vocab_size()

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return self._added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
        """
        return self.total_vocab_size

    def _update_total_vocab_size(self):
        """
        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
        otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow
        and is only updated when adding tokens.
        """
        self.total_vocab_size = len(self.get_vocab())
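
    # Bookkeeping sketch (illustrative; the index and token below are made-up placeholders): the two added-token maps
    # are kept in sync, so a token registered at index 30522 is visible from both directions and counted by `len()`:
    #
    #     tokenizer.added_tokens_decoder[30522]         # AddedToken("my_new-tok2", ...)
    #     tokenizer.added_tokens_encoder["my_new-tok2"]  # 30522
    #     len(tokenizer)                                 # == tokenizer.total_vocab_size, includes added tokens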

    def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added
        to it with indices starting from length of the current vocabulary. Special tokens are sometimes already in
        the vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`list[str]` or `list[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of
        # the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```
        """
        added_tokens = 0
        if new_tokens is None:
            return added_tokens
        # TODO this is fairly slow to improve!
        current_vocab = self.get_vocab().copy()
        new_idx = len(current_vocab)  # only call this once, len gives the last index + 1
        for token in new_tokens:
            if not isinstance(token, (str, AddedToken)):
                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
            if str(token) == "":
                continue
            if isinstance(token, str):
                if token in self._added_tokens_encoder:
                    continue
                else:
                    # very important for fast and slow equivalence!
                    is_special = token in self.all_special_tokens or special_tokens
                    token = AddedToken(
                        token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
                    )
            elif special_tokens:
                # doing token.special=True changes the normalization! will fix in rust
                # this is important and the only reason why the AddedTokens in each class are normalized by default
                token.__setstate__({"special": True, "normalized": token.normalized})
            if not token.special and token.normalized and getattr(self, "do_lower_case", False):
                # Normalize if requested
                token.content = token.content.lower()
            if token.content not in current_vocab:
                token_index = new_idx + added_tokens
                current_vocab[token.content] = token_index
                added_tokens += 1
            else:
                token_index = current_vocab[token.content]

            if token.special and str(token) not in self.all_special_tokens:
                self._special_tokens_map["additional_special_tokens"].append(token)
            # the setter automatically updates the reverse map
            self._added_tokens_decoder[token_index] = token
            self._added_tokens_encoder[token.content] = token_index
            if self.verbose:
                logger.info(f"Adding {token} to the vocabulary")

        self._update_trie()
        self._update_total_vocab_size()
        return added_tokens

    def _update_trie(self, unique_no_split_tokens: Optional[str] = None):
        for token in self._added_tokens_decoder.values():
            if token.content not in self.tokens_trie._tokens:
                self.tokens_trie.add(token.content)
        for token in unique_no_split_tokens or []:
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `list[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase. Might be super slow as well?
            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
            escaped_special_toks += [
                re.escape(s_tok.content)
                for s_tok in (self._added_tokens_decoder.values())
                if not s_tok.special and s_tok.normalized
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
            # "This is something<special_token_1>  else"
            tokens = self.tokens_trie.split(text)

        # ["This is something", "<special_token_1>", "  else"]
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right.
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here
                    if tok_extended.single_word and left and left[-1] != " ":
                        tokens[i - 1] += token
                        tokens[i] = ""
                    elif tok_extended.single_word and right and right[0] != " ":
                        tokens[i + 1] = token + tokens[i + 1]
                        tokens[i] = ""
                else:
                    raise ValueError(
                        f"{tok_extended} cannot be tokenized because it was not properly added"
                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                    )
        # ["This is something", "<special_token_1>", "else"]
        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))
        # ["This", " is", " something", "<special_token_1>", "else"]
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `list[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self._added_tokens_encoder:
            return self._added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError
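
    # Tokenization flow sketch (illustrative; `BertTokenizer` stands in for any concrete slow-tokenizer subclass,
    # mirroring the example in `_add_tokens` above):
    #
    #     tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    #     tokenizer.add_tokens(["my_new-tok2"])
    #     tokenizer.tokenize("hello my_new-tok2 world")
    #     # Added tokens are matched first via `tokens_trie.split`, the remaining text segments are passed to the
    #     # subclass `_tokenize` implementation, and `convert_tokens_to_ids` maps the result to ids.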

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_split_into_words:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
                        " `is_split_into_words=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
                        " integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput],
            list[TextInputPair],
            list[PreTokenizedInput],
            list[PreTokenizedInputPair],
            list[EncodedInput],
            list[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
        )

        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: list[Union[PreTokenizedInputPair, tuple[list[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                padding_side=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
                split_special_tokens=split_special_tokens,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs
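
    # Encoding sketch (illustrative): the public `__call__` / `encode_plus` / `batch_encode_plus` entry points
    # defined on `PreTrainedTokenizerBase` route single inputs to `_encode_plus` and batches to `_batch_encode_plus`
    # above. For a BERT-like tokenizer the returned `BatchEncoding` typically carries `input_ids`, `token_type_ids`
    # and `attention_mask` (the exact keys depend on the model):
    #
    #     enc = tokenizer("hello world", padding="max_length", max_length=8, return_tensors="pt")
    #     enc["input_ids"].shape  # (1, 8)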

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> tuple[str, dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...

    @overload
    def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...

    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            if ids in self._added_tokens_decoder:
                return self._added_tokens_decoder[ids].content
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self._added_tokens_decoder:
                tokens.append(self._added_tokens_decoder[index].content)
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: list[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        spaces_between_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        # If a single id was given, prevent splitting the string in the upcoming loop.
        if isinstance(filtered_tokens, str):
            filtered_tokens = [filtered_tokens]

        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
        }
        # To avoid mixing byte-level and unicode for byte-level BPT
        # we need to build string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_tokens:
                continue
            if token in legacy_added_tokens:
                if current_sub_text:
                    string = self.convert_tokens_to_string(current_sub_text)
                    if len(string) > 0:
                        sub_texts.append(string)
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        if spaces_between_special_tokens:
            text = " ".join(sub_texts)
        else:
            text = "".join(sub_texts)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text