L iEdZddlZddlmZddlmZddlmZmZm Z m Z m Z m Z m Z ddlmZmZmZddlmZmZmZmZdd lmZej2eZdd Zd ed efd ZdZ GddZ!Gdde!Z"ded efdZ#GddZ$Gdde$Z%Gdde$Z&Gdde$Z'Gdde$Z(Gdd e$Z)Gd!d"e$Z*Gd#d$e$Z+Gd%d&e$Z,Gd'd(e$Z-Gd)d*e$Z.Gd+d,e$Z/Gd-d.e$Z0Gd/d0e0Z1Gd1d2e0Z2Gd3d4e0Z3Gd5d6e0Z4Gd7d8e0Z5Gd9d:e0Z6Gd;de0Z8Gd?d@e0Z9GdAdBe0Z:GdCdDe0Z;GdEdFe0Z<GdGdHe0Z=GdIdJe0Z>GdKdLe0Z?GdMdNe0Z@GdOdPe$ZAGdQdRe0ZBGdSdTe$ZCGdUdVe$ZDGdWdXe$ZEGdYdZe0ZFGd[d\e0ZGGd]d^e0ZHGd_d`e$ZIGdadbe0ZJGdcdde0ZKGdedfe0ZLdgZMGdhdiZNidje1dke-dle2dme%dneBdoeEdpe3dqeCdre*dse%dte/due4dve%dwe%dxe%dye%dze%id{e1d|e'd}e*d~e+de%de%de-de9de-de-de%deIde5de6de(de%de-ide7de)de>de,de%de;de<de%de-de.de8de%de?de@deAde9de:e&eFeHeHeGeHdZOdd e fdZPy)z Utilities to convert slow tokenizers in their fast tokenizers counterparts. All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and allow to make our dependency on SentencePiece optional. N)Optional)version) AddedTokenRegex Tokenizerdecoders normalizerspre_tokenizers processors)BPEUnigram WordPiece)is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORctrddlm}|StrSddl}t j |jjt j dkrddl m}|Sddl m }|Sttj|)Nr)sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r sentencepiecerrgoogle.protobufrparseprotobuf __version__transformers.utilsr ImportErrorrformat) error_messagergoogles i/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr##sl!#9&& ==44 5 g8N N B'& b&&/66}EFFadd_prefix_spacereturnc4|rd}t|ddsd}|Sd}|S)NalwayslegacyTfirstnever)getattr)r%original_tokenizerprepend_schemes r"_get_prepend_schemer/4s1!)8T:$N ! r$c|du}|r t|n}g}|jD]j\}}g}tdt|D]*}|d|||d} }|vs| vs|j || |f,t |fd}|j |lt |d|}|D cgc] } | d| df}} |Scc} w)Nrc$|d|dfSNrr)xvocabs r"z!generate_merges..IsU1Q4[%!+,Fr$keycB|dt|dt|dfS)Nrr)lenvals r"r6z!generate_merges..Ls!SVSQ[#c!f+,Nr$r8reverser)dictitemsranger;appendsortedextend) r5 vocab_scoresr?mergesmerge piece_scorelocalindexpiece_lpiece_rr=s ` r"generate_mergesrN>s$&G)04 %eL F*002{1c%j) >E$Ve}eEFmWG%Gu$4 gw <= >u"FG eF NX_ `F*0 13s1vs1v 1F 1 M2s'B<cDeZdZdZdefdZddeeeefe effdZ y)SentencePieceExtractorzl Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece modelcvt|dddlm}||_|jj |y)Nrr)SentencePieceProcessor)rrrSspLoad)selfrQrSs r"__init__zSentencePieceExtractor.__init__Vs)$08(*  Ur$Nr&c|j}t|jDcic]}|j||}}t ||}||fScc}w) By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to order the merges with respect to the piece scores instead. )rTrB GetPieceSize id_to_piecerNrVrFrTrKr5rGs r"extractzSentencePieceExtractor.extract]sV WW;@AR;ST%&-TT  5f} UsAN) __name__ __module__ __qualname____doc__strrWtupler@intlistr]r3r$r"rPrPQs5c E$sCx.$u+2M,N r$rPc4eZdZddeeeefeeffdZy)GemmaSentencePieceExtractorNr&c|j}t|jDcic]}|j||}}d|vr|j d|d<t ||}||fScc}w)rY <0x09>)rTrBrZr[getrNr\s r"r]z#GemmaSentencePieceExtractor.extractksr WW;@AR;ST%&-TT u ))H-E$K  5f}UsA+r^) r_r`rardr@rcrerfr]r3r$r"rhrhjs$ E$sCx.$u+2M,N r$rhpiecec^t|dkxs|ddk7xs|dj S)Nr:,)r;isdigit)rms r"check_number_commars{s3 u:> HU2Y#- HU2Y5F5F5H1HHr$ceZdZdZdefdZy) Converterc||_yr^)r-)rVr-s r"rWzConverter.__init__s "4r$r&ctr^)NotImplementedErrorrVs r" convertedzConverter.converteds !##r$N)r_r`rarWrrzr3r$r"rurus5$9$r$ruceZdZdefdZy) BertConverterr&c l|jj}tt|t |jj }d}d}d}t |jdr`|jjj}|jjj}|jjj}tjd||||_ tj|_t |jj"}t |jj$}|jj&}|jj(} t+j,|d|d|d|d|d ||f|| fg |_t1jd |_|S) N unk_tokenFbasic_tokenizerT clean_texthandle_chinese_chars strip_accents lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr-r5rrrcrhasattrrtokenize_chinese_charsr do_lower_caser BertNormalizer normalizerr BertPreTokenizer pre_tokenizer cls_token sep_token cls_token_id sep_token_idr TemplateProcessingpost_processorrdecoder rVr5 tokenizerrrrclsseprrs r"rzzBertConverter.converted''--iT=T=T=^=^9_`a !&  4**,= >%)%<%<%L%L%c%c " 33CCQQM 33CCQQM*99!7'#   #1"A"A"C $))334$))334..;; ..;; #-#@#@U(3%r*5XcU"5l#l#$  %..d; r$Nr_r`rarrzr3r$r"r|r|#9#r$r|ceZdZdefdZy)SplinterConverterr&c |jj}tt|t |jj }d}d}d}t |jdr`|jjj}|jjj}|jjj}tjd||||_ tj|_t |jj"}t |jj$}t |jj&}d} |jj(} |jj*} |jj,} |jj/d} |jj0dk(r|d|d | d |d |d }n|d|d |d | d |d }t3j4|d|d ||| f|| f|| f| | fg |_t9jd|_|S)Nr~FrTr.rightr rrrrrr)r-r5rrrcrrrrrrr rrr rrrrquestion_tokenrrquestion_token_idconvert_tokens_to_ids padding_sider rrrr)rVr5rrrrrrquestiondotrrr dot_token_idrs r"rzzSplinterConverter.converteds"''--iT=T=T=^=^9_`a !&  4**,= >%)%<%<%L%L%c%c " 33CCQQM 33CCQQM*99!7'#   #1"A"A"C $))334$))334t..==>..;; ..;;  33EE..DDSI  " " / /7 :U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*l#l#,-l#  $  %..d; r$Nrr3r$r"rrs.9.r$rceZdZdefdZy)FunnelConverterr&c l|jj}tt|t |jj }d}d}d}t |jdr`|jjj}|jjj}|jjj}tjd||||_ tj|_t |jj"}t |jj$}|jj&}|jj(} t+j,|d|d|d|d|d ||f|| fg |_t1jd |_|S) Nr~FrTrz:2 $A:0 rrrrrrrrs r"rzzFunnelConverter.convertedrr$Nrr3r$r"rrrr$rceZdZdefdZy)MPNetConverterr&c r|jj}tt|t |jj }d}d}d}t |jdr`|jjj}|jjj}|jjj}tjd||||_ tj|_t |jj"}t |jj$}|jj&}|jj(} t+j,|d|d|d|d|d |d ||f|| fg |_t1jd |_|S)Nr~FrTrrrz:0 rrrrrrrs r"rzzMPNetConverter.converteds''--iT=T=T=^=^9_`a !&  4**,= >%)%<%<%L%L%c%c " 33CCQQM 33CCQQM*99!7'#   #1"A"A"C $))334$))334..;; ..;; #-#@#@U(3%r*5SXcU"=l#l#$  %..d; r$Nrr3r$r"rrrr$rceZdZdefdZy)OpenAIGPTConverterr&c |jj}t|jjj }|jj }t t||dt|dd}|jt||jt|gtjd|_ tj|_t#j$d|_|S)NF)r5rGdropoutrend_of_word_suffixfuse_unkT)rsuffix)r-encoderrf bpe_rankskeysrrr rc token_to_idadd_special_tokensr rrr rrr BPEDecoderrrVr5rGrrs r"rzzOpenAIGPTConverter.converted/s''//d--77<<>?++55  i.#)      Y 0 <  ( (#i.)9 :*99DI "0"A"A"C $//v> r$Nrr3r$r"rr.s9r$rc JeZdZ ddeeeefdeeeeefde fdZ y) GPT2ConverterNr5rGr&c N|s|jj}|st|jj}t t ||dddd}t |jdd}tj||_ tj|_ t |jddrT|jj}|jj}tj|d|d||fg |_|Stjd |_|S) NFr5rGrcontinuing_subword_prefixrrr%r% add_bos_tokenz:0 $A:0z :0 $A:0 $B:1r trim_offsets)r-rrfrrr r,r ByteLevelrrr bos_token bos_token_idr rr)rVr5rGrr%bosrs r"rzzGPT2Converter.convertedJs++33E$11;;FtERUWZRZOG\>]$ $r$rceZdZdefdZy)HerbertConverterr&c d}d}|jj}t|jjj }||ddvr|dd}t t ||d|jj|}tjdd|_ tj|_ tj||_t#j$|jj&|jj(f|jj*|jj,f |_|S) Nz #version:rrr)rrrF)rrr)rr)r-rrfrrrr rr rrr rrrrrr BertProcessingrrrrr)rVtokenizer_info_str token_suffixr5rGrs r"rzzHerbertConverter.convertedrs( ''//d--77<<>? 1 -ABZF 11;;#/    +99EY^_ "0"A"A"C $//|D #-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$  r$Nrr3r$r"rrq9r$rc JeZdZ ddeeeefdeeeeefde fdZ y)Qwen2ConverterNr5rGr&c 0|s|jj}|s-t|jjj }t t ||dddddd}tj|_ tjtjtdddtjt|jdddg|_t#j|_t'jd |_|S) NrF)r5rGrrrrr byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr%r% use_regexr)r-rrfrrrr r NFCrr SequenceSplitrrr,rrrr r)rVr5rGrs r"rzzQwen2Converter.converteds++33E$11;;@@BCF *,#%#    +0 "0"9"9$$N( ((%,T-D-DFXZ_%`# #  %..0 #-#7#7U#K  r$rrr3r$r"rrsD`d*d38n-*>FtERUWZRZOG\>]* *r$rceZdZdefdZy)RobertaConverterr&c |j}|j}t|jj }t t ||dddd}tj|j|_ tj|_ tj|j|j f|j"|j$f|jd|_|S)NrFrrTrrr%r)r-rrfrrrr r rr%rrrr RobertaProcessingrrrrrrVotr5rGrs r"rzzRobertaConverter.converteds  $ $ bll'')* *,#%    #1":":BL_L_"` $..0 #-#?#?r/r/00 $  r$Nrr3r$r"rr9r$rceZdZdefdZy)RoFormerConverterr&c Vddlm}|jj}t t |t |jj}d}d}t|jdr@|jjj}|jjj}tjdd|||_tj j#|||_t |jj&}t |jj(}|jj*}|jj,} t/j0|d|d |d|d |d ||f|| fg |_t5j d |_|S)Nr)JiebaPreTokenizerr~FrTrrrrrrrr)"models.roformer.tokenization_utilsrr-r5rrrcrrrrrr rrr PreTokenizercustomrrrrrr rrrr) rVrr5rrrrrrrs r"rzzRoFormerConverter.convertedsyI''--iT=T=T=^=^9_`a   4**,= > 33CCQQM 33CCQQM*99!&'#   #1"="="D"DEVW\E]"^ $))334$))334..;; ..;; #-#@#@U(3%r*5XcU"5l#l#$  %..d; r$Nrr3r$r"rrrr$rceZdZdefdZy)DebertaConverterr&c |j}|j}t|jj }t t ||dddd}tj|j|_ tj|_ tjddd|jjdfd|jjdfg |_|S) NrFrr[CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r)r-rrfrrrr r rr%rrrr rrrrs r"rzzDebertaConverter.converteds  $ $ bll'')* *,#%    #1":":BL_L_"` $..0 #-#@#@)4$11GGPQ$11GGPQ$  r$Nrr3r$r"rrrr$rc`eZdZdZeZiZfdZdZdZ dZ dZ dZ dZ d Zd efd ZxZS) SpmConverterFct|dt||t}|j }t |j jd5}|j|jddd||_ |jjjr#|jstjdyyy#1swYTxYw)NrrbaThe sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)rsuperrWr# ModelProtoopenr- vocab_fileParseFromStringreadproto trainer_specrhandle_byte_fallbackwarningswarn)rVargs model_pb2mf __class__s r"rWzSpmConverter.__init__&s$ + $$%  " $))44d ; (q  affh ' ( :: " " 0 09R9R MMe :S 0  ( (s  CCcl|jDcgc]}|j|jfc}Scc}wr^piecesrmscorerVrrms r"r5zSpmConverter.vocab;s'8= Euekk*EEEs1c.|jjSr^)runk_idrVrs r"rzSpmConverter.unk_id>s!!(((r$c ~|jj}|j|}|dk(r1tt ||j ||j }n|dk(r|j|jjj|\}}t|D cic] \}\}} || } }}} tt| ||jjd|j d}n tdt|jD cgc]I\} } | j dvr6| | j"| j dk(xs| j"|j$vfK} } } |j't)| d D cgc]\} }}t+|d | c}}} |Scc} }}wcc} } wcc}}} w) Nrrrr:Trrrrz]You're trying to run a `Unigram` model but you're file was trained with a different algorithmr$c |dSNrr3r4s r"r6z(SpmConverter.tokenizer..m QRSTQUr$r7F normalizedspecial)r model_typer5rr rr SpmExtractorr-r r] enumerater unk_piece Exceptionrtypermr add_tokensrDr)rVrr-rFr_rGiwordr bpe_vocabidpspm_added_tokenstokenr,s r"rzSpmConverter.tokenizerAs''22 zz%( ?! ;;u-"&";";I1_))$*A*A*L*LMUUVbcIAv9B<9PQQ%5Q uqQIQ!#00::!"&";";  Io #5<<0 Avv!&&A+GD4G4G)G H  +11A~*V  &Bw5UGD  CR*  s)F+AF2F8 c |jj}tjddtjt ddg}|stj |Stj tj|g|zS)NFT)leftr {2,}▁)normalizer_specprecompiled_charsmapr StripReplacerr PrecompiledrVrrA _normalizerss r"rzSpmConverter.normalizerss{$44II   5 5   g 6 $'' 5 5'')@)@AU)V(WZf(fg gr$c\t||j}tj||SN replacementr.)r/r-r MetaspacerVrJr%r.s r"rzSpmConverter.pre_tokenizer~s),-=t?V?VW''KP^__r$cyr^r3rys r"rzSpmConverter.post_processorsr$c\t||j}tj||SrH)r/r-rrKrLs r"rzSpmConverter.decoders(,-=t?V?VW!!k.YYr$r&cz|j|j}|j|j}|||_d}d}t|jdr|jj }|j ||}|||_|j|||_|j}|r||_|S)Nr?Tr%) rrrrr-r%rrr)rVrrrJr%rrs r"rzzSpmConverter.convertedsNN4::. __TZZ0  !#-I   4**,> ?#66GG **;8HI  $&3I # LL6FG ,,. '5I $r$)r_r`rarrPr.rrWr5rrrrrrrrz __classcell__)rs@r"rr!sL )LN*F)0d h`Z9r$rceZdZdZdZdZy)AlbertConverterc|jDcgc]J}t|jr|j|jfn|j|jdz fLc}Scc}wNdrrsrmrrs r"r5zAlbertConverter.vocab^ +=U[[*IU[[%++ &PUP[P[]b]h]hkn]nOo o   AA!ctjddtjddg}|jjsF|j tj |j tj |jjr#|j tj|jj}|r$|j tj||j tjtddtj|SNz``"z''r>rr rCr- keep_accentsrCNFKD StripAccentsr Lowercaser@rArDrrrVrlist_normalizersrAs r"rzAlbertConverter.normalizer   c *   c * &&33  # #K$4$4$6 7  # #K$<$<$> ?  " " 0 0  # #K$9$9$; <$44II   # #K$;$; $A z $A $B rrfrys r"rzBarthezConverter.post_processorR,, +//EEeLM00FFvNO  r$N)r_r`rarrr3r$r"rkrks  r$rkceZdZdZdZdZy)CamembertConvertercgd}||jddDcgc]}|j|jfc}z }|dgz }|Scc}w)N))z NOTUSEDrx)z NOTUSEDrxzrx)z NOTUSEDirzrxrrVrr5rms r"r5zCamembertConverter.vocabsP  %,,qr:JK5;; ,KK /"" LAcyrmr3rs r"rzCamembertConverter.unk_idsr$c tjddd|jjdfd|jjdfgSrqrfrys r"rz!CamembertConverter.post_processorrtr$Nr_r`rar5rrr3r$r"rvrvs  r$rvceZdZdZdZdZy)DebertaV2Convertercg}|jjr%|jtjdt ||j}|jtj ||tj|S)Nr)rrI)r-split_by_punctrCr Punctuationr/rKr)rVrJr%list_pretokenizersr.s r"rz DebertaV2Converter.pre_tokenizersq  " " 1 1  % %n&@&@*&U V,-=t?V?VW!!.":":{cq"rs&&'9::r$cg}|jjr#|jtj|jtj |j j}|r$|jtj||jtjtddtj|S)Nr>r) r-rrCr r`rBr@rArDrCrrras r"rzDebertaV2Converter.normalizers  " " 0 0  # #K$9$9$; < 1 1 34$44II   # #K$;$; en_XXz$A $B en_XXrrsrrfrys r"rzMBartConverter.post_processor?R,,"#$11GGPQ00FFvNO  r$Nrr3r$r"rrs$L r$rceZdZdZdZdZy)MBart50Convertercgd}||jddDcgc]}|j|jfc}z }|gdz }|dgz }|Scc}w)Nrr$)4rrrrrrrrrrrrrrrrrrrrrrrrr)af_ZArx)az_AZrx)bn_INrx)fa_IRrx)he_ILrx)hr_HRrx)id_IDrx)ka_GErx)km_KHrx)mk_MKrx)ml_INrx)mn_MNrx)mr_INrx)pl_PLrx)ps_AFrx)pt_XXrx)sv_SErx)sw_KErx)ta_INrx)te_INrx)th_THrx)tl_XXrx)uk_UArx)ur_PKrx)xh_ZArx)gl_ESrx)sl_SIrxr|rr}s r"r5zMBart50Converter.vocabKsa  %,,qr:JK5;; ,KK R R  /"" Lrcyrmr3rs r"rzMBart50Converter.unk_idWrr$c tjddd|jjdfd|jjdfgS)Nz en_XX $A zen_XX $A $B rrsrrfrys r"rzMBart50Converter.post_processorZrr$Nrr3r$r"rrJs  r$rceZdZdZdZdZy) NllbConvertercgd}||jddDcgc]}|j|jfc}z }|Scc}w)Nrr$rr}s r"r5zNllbConverter.vocabfC  %,,qr:JK5;; ,KK L=cyrmr3rs r"rzNllbConverter.unk_idprr$c tjddd|jjdfd|jjdfgS)Nzeng_Latn $A zeng_Latn $A $B eng_Latnrsrrfrys r"rzNllbConverter.post_processorssR,,%&T44JJ:VW00FFvNO  r$Nrr3r$r"rres r$rceZdZdZdZdZy)SeamlessM4TConvertercgd}||jddDcgc]}|j|jfc}z }|Scc}w)N)ryr{rrr$rr}s r"r5zSeamlessM4TConverter.vocabrrc.|jjSr^)r- unk_token_idrs r"rzSeamlessM4TConverter.unk_ids&&333r$c tjddd|jjdfd|jjdfgS)Nz__eng__ $A z__eng__ $A $B __eng__rsrrfrys r"rz#SeamlessM4TConverter.post_processorsR,,$%D33II)TU00FFvNO  r$Nrr3r$r"rr~s4 r$rceZdZdZdZdZy)XLMRobertaConvertercgd}||jddDcgc]}|j|jfc}z }|dgz }|Scc}w)Nrr$r|rr}s r"r5zXLMRobertaConverter.vocabsP  %,,qr:JK5;; ,KK /"" Lr~c d}|Srmr3rns r"rzXLMRobertaConverter.unk_idror$c tjddd|jjdfd|jjdfgSrqrfrys r"rz"XLMRobertaConverter.post_processorrtr$Nrr3r$r"rr  r$rceZdZdZdZdZy)XLNetConverterc|jDcgc]J}t|jr|j|jfn|j|jdz fLc}Scc}wrTrVrs r"r5zXLNetConverter.vocabrWrXctjddtjddg}|jjsF|j tj |j tj |jjr#|j tj|jj}|r$|j tj||j tjtddtj|SrZr\ras r"rzXLNetConverter.normalizerrcr$c tjddd|jjdfd|jjdfgS)Nz$A:0 :0 :2z!$A:0 :0 $B:1 :1 :2zzrrfrys r"rzXLNetConverter.post_processorrgr$Nrhr3r$r"rrrir$rc eZdZy)ReformerConverterNr_r`rar3r$r"rrr$rceZdZdZdZy)RemBertConvertercbtjddtjddtjtddg}|jjsF|j tj |j tj|jjr#|j tj|jj}|r$|j tj|tj|SrZ)r rCrr-r]rCr^r_rr`r@rArDrras r"rzRemBertConverter.normalizers   c *   c *   g 4  &&33  # #K$4$4$6 7  # #K$<$<$> ?  " " 0 0  # #K$9$9$; <$44II   # #K$;$;gY) r- pad_token eos_tokenmask_token_sent mask_token mask_token_idoffsetrBrrmr)rVrr5r5rms r"r5zPegasusConverter.vocabs%  $ $ . . 4  $ $ . . 4   " " 2 2 > t..>>DE EE  # # . . :''558O8O8V8VV t..993?@ @E %4;R;R;Y;Y2Z[QU1#Q<([[ %,,qr:JK5;; ,KK \Ks %D1 D6c\|jj|jjzSr^)rrr-rrs r"rzPegasusConverter.unk_ids%!!((4+B+B+I+IIIr$ct||j}tjtjtj ||gSrH)r/r-r rWhitespaceSplitrKrLs r"rzPegasusConverter.pre_tokenizersJ,-=t?V?VW&&..0(([Q_`   r$c|jj}||jjfg}tjd|gdd|g|S)N$A$Br)r-r eos_token_idr r)rVeosrs r"rzPegasusConverter.post_processorsR%%// $))66 7 ,,T3KtTSVFWhvwwr$N)r_r`rar5rrrr3r$r"r r s&J xr$r ceZdZdZdZy) T5Converterc|jj}|jDcgc]}|j|jf}}|t |dz ddDcgc] }d|ddf c}z }|Scc}wcc}w)Nrroz F%++u{{+FF E-!:KRQS4TUqZs!$c*UU GUs A/A4crtjddggdd|jjdfgSNrrs)rrsrrsrrfrys r"rzT5Converter.post_processor.=,,&>-00FFvNO  r$N)r_r`rar5rr3r$r"rr's   r$rceZdZdZy) UdopConvertercrtjddggdd|jjdfgSr#rfrys r"rzUdopConverter.post_processor9r$r$Nr_r`rarr3r$r"r&r&8s r$r&ceZdZdefdZy)WhisperConverterr&c |jj}t|jjj }t t ||dddd}tj|jj|_ tj|_ |jj}|jj|}|jj}|jj }dj#|Dcgc]}|d c}} t%j&| d|d| d|d ||fgt)|| |_|Scc}w) NrFrrrrz $A:0 z $A:0 $B:1 rr)r-rrfrrrr r rr%rrr prefix_tokensconvert_ids_to_tokensrrjoinr rzipr) rVr5rGrprefix_token_idsprefixesrrr;prefix_templates r"rzzWhisperConverter.convertedDsR''//d--77<<>? *,#%    #1":":DLcLcLtLt"u $..0 22@@**@@AQR%%//..;; ((h#GUugRL#GH#-#@#@%&fSE4#$KuB7l#X/0$  $Hs ENrr3r$r"r*r*Cs 9 r$r*ceZdZdZy)BigBirdConverterc tjddd|jjdfd|jjdfgSrerfrys r"rzBigBirdConverter.post_processorhrgr$Nr(r3r$r"r4r4gs r$r4ceZdZdefdZy) CLIPConverterr&c p|jj}t|jjj }|jj }t t||ddddt|}tjtjtjtddtjg|_t!jt!j"tddd t!j$d g|_t)j$|_t-j.|jj0|jj2f|jj4|jj6fdd |_|S) NrrFr5rGrrrrrz\s+rz9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTrrr)r-rrfrrrrr rcr rrrCrr`rr rrrrrr rrrrrrrs r"rzzCLIPConverter.convertedtsk''//d--77<<>?++55  *,#)i.    +33 __  3 3E&M3 GI^I^I` a #1"9"9$$Z[& ((%@   # %..0 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY" $  r$Nrr3r$r"r7r7ss'9'r$r7ceZdZdefdZy)LayoutLMv2Converterr&c l|jj}tt|t |jj }d}d}d}t |jdr`|jjj}|jjj}|jjj}tjd||||_ tj|_t |jj"}t |jj$}|jj&}|jj(} t+j,|d|d|d|d|d ||f|| fg |_t1jd |_|S) Nr~FTrrrrrrrrrrrs r"rzzLayoutLMv2Converter.converteds''--iT=T=T=^=^9_`a !&  4**,= >%)%<%<%L%L%c%c " 33CCQQM 33CCQQM*99!7'#   #1"A"A"C $))334$))334..;; ..;; #-#@#@U(3%r*5XcU"5l#l#$  %..d; r$Nrr3r$r"r<r<rr$r<ceZdZdefdZy)BlenderbotConverterr&c |j}|j}t|jj }t t ||dddd}tj|j|_ tj|_ tjd|jd|j|j fg|_|S)NrFrrz$A:0 r)rr)r-rrfrrrr r rr%rrrr rrrrrs r"rzzBlenderbotConverter.converteds  $ $ bll'')* *,#%    #1":":BL_L_"` $..0 #-#@#@2<<.+r/$  r$Nrr3r$r"r?r?rr$r?ceZdZdZdZdZy) XGLMConvertercgd}||jddDcgc]}|j|jfc}z }|gdz }|Scc}w)Nrr$))z rx)z rx)z rx)z rx)z rx)z rx)z rxrr}s r"r5zXGLMConverter.vocabsT  %,,qr:JK5;; ,KK z z LsAc d}|Srmr3rns r"rzXGLMConverter.unk_idror$c tjddd|jjdfd|jjdfgS)Nz $Az $A $Brrrsrrfrys r"rzXGLMConverter.post_processorsR,,'//EEeLM00FFvNO  r$Nrr3r$r"rBrBrr$rBc<eZdZdZeZddhZ dZdZdZ dZ dZ y ) GemmaConverterTzz c.tjddSNrr?)r rCrs r"rzGemmaConverter.normalizer s""3..r$ct|jjdf|jjdf|jjdfg}||jddDcgc]}|j |j fc}z }td|Ds#tdt|Dd}|d||<|Scc}w)Nrxr$c3,K|] }|ddk(yw)rrjNr3).0r4s r" z'GemmaConverter.vocab..s/A1Q44</sc38K|]\}}|ddk(s|yw)rrkNr3)rLr5r4s r"rMz'GemmaConverter.vocab..s"VAQqTXEU1"Vs)rjrx) r-rrrrrmranynextr/)rVrr5rmoverride_indexs r"r5zGemmaConverter.vocabs  $ $ . . 4  $ $ . . 4  $ $ . . 4  %,,qr:JK5;; ,KK///!"V51A"VX\]N)(3n% LsB5c.tjddS)Nrmerged_with_previous)r rrVrJr%s r"rzGemmaConverter.pre_tokenizer s##C)?@@r$c d}|Srmr3rns r"rzGemmaConverter.unk_id#ror$ctjtjddtjtjgS)Nr?r)rrrC ByteFallbackFuserTs r"rzGemmaConverter.decoder's?    ,%%'    r$N) r_r`rarrhr.rrr5rrrr3r$r"rGrGs6.L'9N/ A r$rGc4eZdZdZdZdZdZdZdZdZ y) LlamaConverterTc(|jjddf|jjddf|jjddfg}||jddDcgc]}|j|jfc}z }|Scc}w)Nrrxrr:r$)r-r-rrmrr}s r"r5zLlamaConverter.vocab4s  $ $ : :1 =s C  $ $ : :1 =s C  $ $ : :1 =s C  %,,qr:JK5;; ,KK Ls)Bc d}|Sr'r3rns r"rzLlamaConverter.unk_id=ror$ctjddtjtjg}|r|tjddgz }tj |SNr?rr)contentr=rrCrWrXrBrrVrJr%sequences r"rzLlamaConverter.decoderA\   UC (  ! ! # MMO   !<= =H  **r$ct|jddrcg}t|jddr|tjdgz }|tjddgz }tj |Sy)Nr)Tr%r?)prependr)patternr_)r,r-r PrependrCr)rVrrbs r"rzLlamaConverter.normalizerKsr 4**Hd ;Ht..0BDI[00?@@ ,,S%HI IH''1 1r$ct|jdds.t||j}tj||dSy)Nr)TFrJr.split)r,r-r/r rKrLs r"rzLlamaConverter.pre_tokenizerTsAt..$?01A4CZCZ[N!++ Tbjop pr$cyr^r3rys r"rzLlamaConverter.post_processorZsr$N) r_r`rarr5rrrrrr3r$r"rZrZ1s&+ r$rZceZdZdefdZy)MarkupLMConverterr&c |j}|j}t|jj }t t ||dddd|jj}tj|j|_ tj|_ t|jj}t|jj }|jj"}|jj$}t'j(|d||d|d|||f||fg|_|S)NrFr9rz $A z $B r)r-rrfrrrr rr rr%rrrrcrrrrr rr) rVrr5rGrrrrrs r"rzzMarkupLMConverter.converted`s,  $ $ bll'')* *,#%11;;   #1":":BL_L_"` $..0 $))334$))334..;; ..;; #-#@#@U$se$5SEcU+l#l#$  r$Nrr3r$r"rmrm_s"9"r$rmc*eZdZdZddZdZdZdZy)MoshiConverterTNc t|dtj||t}|j }t |d5}|j |jddd||_y#1swY||_yxYwNrr rrurWr#r r r r r)rVr model_max_lengthkwargsrrrs r"rWzMoshiConverter.__init__sr$ +4,$%  " *d # (q  affh ' (  ( A99B c|jj}tjddg}|stj|Stjtj |g|zSrI)r@rAr rCrrDrEs r"rzMoshiConverter.normalizersg$44II   U + $'' 5 5'')@)@AU)V(WZf(fg gr$ctjddtjtjg}|r|tjddgz }tj |Sr^r`ras r"rzMoshiConverter.decoderrcr$c6d}tj||dS)Nr*Fri)r rKrLs r"rzMoshiConverter.pre_tokenizers ''KP^fkllr$r^)r_r`rarrWrrrr3r$r"rprps h+mr$rpcBeZdZdZd dZdZdZdZdZdZ d Z d Z y) HeliumConverterTNc t|dtj||t}|j }t |d5}|j |jddd||_y#1swY||_yxYwrrrs)rVr rurrrs r"rWzHeliumConverter.__init__sp$ +4,#%  " *d # (q  affh ' (  ( rvc V|j|}tt||j||j}t |j Dcgc]I\}}|jdvr6||j|jdk(xs|j|jvfK}}}|jt|dDcgc]\}}}t|d|dc}}}|jtd dd g|jd d |Scc}}wcc}}}w) Nr!r#r$c |dSr'r3r(s r"r6z+HeliumConverter.tokenizer..r)r$r7FT)r+r, single_word r*rz)rpad_id)r5rr rrr/rr2rmrr3rDrenable_padding) rVrrFrr8r9r:r;r,s r"rzHeliumConverter.tokenizers zz%(  {{5)"77  #5<<0 Avv!&&A+GD4G4G)G H  +11A~*V  &Bw5UGQUV  j%OPQ  71 =  s ADD$ cg}|jD]@}|jdk(r|d|jfgz }%||j|jfgz }B|S)Nz<0x0A>rrr}s r"r5zHeliumConverter.vocabs]\\ 6E{{h&4-..5;; 455  6  r$c d}|Sr'r3rns r"rzHeliumConverter.unk_idror$ctjddtjtjg}|tjddgz }tj |Sr^r`ras r"rzHeliumConverter.decodersY   UC (  ! ! # MMO  X^^Ca899  **r$c~tjtjdtjddgSrI)r rrgrCrs r"rzHeliumConverter.normalizers2##[%8%8%={?R?RSWY^?_$`aar$cVtjtjddgS)Nr contiguous)r rrrTs r"rzHeliumConverter.pre_tokenizers#&&(<(.0r)r$r7Fr*)r5r.r r]r/rr rr0rrr2rmrr3rDr)rVrrFr4rGr5r6rr7rr8r9r:r;r,s r"rzParakeetConverter.tokenizersMzz%( %%doo6>>|L 65>|5LMM!1MT5T1WM M ,,66"77    #5<<0 Avv!&&A+GD4G4G)G H  +11A~*V  &Bw5UGD  3N  sE2AE  E r^)r_r`rarrWrr3r$r"rrs r$rc tttdtddztttdtddzztttdtddzz}|dd}d }td D]1}||vs|j||jd |z|dz }3|Dcgc] }t |}}t t ||Scc}w) a8 Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. !~r¡¬®ÿNr)rfrBordrCchrr@r/)bscsnbs r"bytes_to_unicoder8s U3s8SX\ *+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{ AB A 4[ B; IIaL IIdQh  FA   Q#a& B  B  s C4c>eZdZdZ d dZdefdZdZdefdZ y) TikTokenConverterz' A general tiktoken converter. Nc ||_||_||_t|tr|j |_y||_yr^)r rfr% isinstancer@radditional_special_tokens)rVr rfr%rrus r"rWzTikTokenConverter.__init__UsJ% 03T: & * * , &+ &r$ tiktoken_urlc0  ddlm}|| t fd}g}i} j D]\}}||||<t |dk(r g}tdt |D]2} |d| || d} } | vs| vs| | z vs|j| | |f4t| fdd}|j|t|d d}|D cgc]} || d|| df}} ||fS#t$r tdwxYwcc} w) Nr)load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c dj|jdDcgc]}t|c}Scc}w)Nrzlatin-1)r.decoder)rchar byte_encoders r"token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model..token_bytes_to_stringqs277@STLT3TU UTs<rc$|d|dfSr2r3)r4rs r"r6zCTikTokenConverter.extract_vocab_merges_from_model..s1Q4)AaD/0Rr$Fr>c |dS)Nr:r3r<s r"r6zCTikTokenConverter.extract_vocab_merges_from_model..s Ar$) tiktoken.loadrr1 ValueErrorrrAr;rBrCrDrE)rVrrrrGr5r;rankrJrKrLrMr=rrs @@r"extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_modelfsY  7 &l3 ')  V$??, !KE426E'. /5zQEq#e*- ;#(%=%-i'Gy,@gPWFW\eEeLL'7D!9: ;5&R\abE MM%  !$6F\bcUX(Q02GA2OPccf}5 k  2dsC;D;Dc|j|j\}}tt||d}t |j drd|j _|S)NF)r ignore_mergesT)rr rr rrQr)rVrFrGrs r"rzTikTokenConverter.tokenizersN#CCDOOT fc,GH 9??O 4,0IOO )r$r&c |j}tjtjt |j ddtj |jdg|_tj |_ |j|jDcgc]}t|ddc}tj d|_|Scc}w)NrFrrTr*r)rr rrrrfrr%rrrrrrr r)rVrr;s r"rzzTikTokenConverter.convertedsNN$ "0"9"9$$U4<<%8:V[\(($:O:O[`a #  %..0 $$LPLjLj k5Z% > k $.#7#7U#K   ls'C )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN) r_r`rarbrWrcrrrrzr3r$r"rrPs: K"&  "C>9r$rAlbertTokenizer BartTokenizerBarthezTokenizer BertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizer CLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizer FNetTokenizerFunnelTokenizer GPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizer LEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizer MvpTokenizer NllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizer T5Tokenizer UdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizer XGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizer Phi3Tokenizercv|jj}|tvr!|st|}||jS tj dt |j|jjS#t$r*tdttjwxYw)a Utilities to convert a slow tokenizer instance in a fast tokenizer instance. Args: transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]): Instance of a slow tokenizer to convert in the backend tokenizer for [`~tokenization_utils_base.PreTrainedTokenizerFast`]. from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece. Defaults to False. Return: A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a [`~tokenization_utils_base.PreTrainedTokenizerFast`] zConverting from Tiktoken)r rzConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ) rr_SLOW_TO_FAST_CONVERTERSrzloggerinforr rr1rrfr)transformer_tokenizer from_tiktokentokenizer_class_nameconverter_classs r"convert_slow_tokenizerrs 1::CC66}12FG45??AA  KK2 3$0;;*?*Y*Yik  >>BCZC_C_Ca>b=ce  s AB3B8)r)F)Qrbrtypingr packagingr tokenizersrrrrr r r tokenizers.modelsr r rutilsrrrrutils.import_utilsr get_loggerr_rr#boolrcr/rNrPrhrsrur|rrrrrrrrrrrrRrkrvrrrrrrrrrr r rr&r*r4r7r<r?rBrGrZrmrpr{rrrrrr3r$r"rswfff55``5   H %G"$s&2"8"IcIdI$$$I$N/ /d$i$N$Y$N6%I%Py>+Y+\y:$ $Ny>~9~B" l" J  |    :  B2 \2 j | 6 L 2 < 2 , 6" \" J   | @ l %x|%xP , " L !y!H  |  (I(V$)$N): L 61 \1 h+\+\# #L&m\&mRV lV r- -b0LL^::%:(:] : ( : . :,:]: ::(:,:=:-:"=: !-!:" #:$_%:&':(]):*(+:,-:.=/:0+1:2-3:4+5:6$7:8}9::*;:<n=:>(?:@nA:B=C:D$E:F]G:H,I:J(K:LnM:NmO:P*Q:R(S:T-U:V(W:X*Y:Z0[:\M]:^;_:`]a:b(c:d.e:fng:h+"$($#s:z!)!r$