L idddlmZmZmZmZmZddlmZddlm Z ddl m Z GddZ de fdZy ) )Regex Tokenizerdecoderspre_tokenizers processors)BPE)LlamaTokenizerFast)bytes_to_unicodec>eZdZdZ d dZdefdZdZdefdZ y) MistralConverterz' A general tiktoken converter. Nc <||_||_||_||_y)N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfrrrrkwargss g/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/integrations/mistral.py__init__zMistralConverter.__init__ s"  0)B&rc& | t fd}g}i}t jD]\}\}}||jvr||||<t |dk(r1g}t dt |D]2}|d|||d} } | vs| vs| | z vs|j | | |f4t| fdd}|j||||<t|dd}|D cgc]} || d|| df}} ||fScc} w)Nc dj|jdDcgc]}t|c}Scc}w)Nzlatin-1)joindecodeord)bchar byte_encoders rtoken_bytes_to_stringzOMistralConverter.extract_vocab_merges_from_model..token_bytes_to_strings277@STLT3TU UTs<c$|d|dfS)Nrr!)x bpe_rankss rzBMistralConverter.extract_vocab_merges_from_model..-sYqt_iPQRSPTo4VrF)keyreversec |dS)Nr#)vals rr&zBMistralConverter.extract_vocab_merges_from_model..1s Arr) r enumerateitemsrlenrangeappendsortedextend)rrr mergesidxtokenranklocalindexpiece_lpiece_rr+r%rs @@rextract_vocab_merges_from_modelz0MistralConverter.extract_vocab_merges_from_modelsK ')  V"+IOO,="> # C%D:::69+E23u:?"1c%j1?E',Ve}eEFmWG)+90D'T[J[`iIi gw%=>?u*V`ef e$"e  #$6F\bcUX(Q02GA2OPccf}ds+Dc|j|j\}}tt||d}t |j drd|j _|S)NF)fuse_unk ignore_mergesT)r;rrrhasattrmodelr>)r vocab_scoresr3 tokenizers rrBzMistralConverter.tokenizer5sN#CCDJJO fc,GH 9??O 4,0IOO )rreturnc|j}tjtjt |j ddtj |jdg|_tj |_ |j|jtj d|_|S)NisolatedF)behaviorinvert)r use_regex) trim_offsets)rBrSequenceSplitrr ByteLevelr pre_tokenizerrdecoderadd_special_tokensrrpost_processor)rrBs r convertedzMistralConverter.converted<sNN$ "0"9"9$$U4<<%8:V[\(($:O:O[`a #  %..0 $$T%C%CD#-#7#7U#K  r)Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN) __name__ __module__ __qualname____doc__rstrr;rBrrQr#rrr r s; K"& CS6 9 rr tokenizer_filecddlm}|j|}|jjj }|jjj Dcgc]}t|dr |jn|}}|Dcic]}||j|}}|j||}tt||j}|jd|i|Scc}wcc}w)z1Convert a "tekken" tokenizer to a fast Tokenizer.r)MistralTokenizervalue)rr)tokenizer_objectr)(mistral_common.tokens.tokenizers.mistralrY from_fileinstruct_tokenizerrB_tekken_token2id_nospecial_all_special_tokensr?rZr8updater r rQrO)rWrYmistral_tokenizerrr5 all_specialspecials_tokensrBs rconvert_tekken_tokenizerreLsJ)22>B  0 0 : : U UE'99CCWW ug. E9KEPP5uk//66POP5! E#)Q\]ggiI   "={!KL !Qs !C)C.N) tokenizersrrrrrtokenizers.modelsr transformersr #transformers.convert_slow_tokenizerr r rVrer#rrrjs-MM!+@AAHSr