L izddlmZmZmZmZmZmZddlmZm Z m Z m Z m Z ddl mZddlmZddlmZGddeZy ) )DictIteratorListOptionalTupleUnion) AddedToken Tokenizerdecoderspre_tokenizerstrainers)BPE)NFKC) BaseTokenizerceZdZdZ ddeeeeeeffdeeee e eeffdeee fded e d ee d ee ffd Zed edefdZdddgdgdfdeee efdedede eee fdede ede fdZdddgdgddfdeeeeeefdedede eee fdede ede deefdZxZS)SentencePieceBPETokenizerzrSentencePiece BPE Tokenizer Represents the BPE algorithm, with the pretokenization used by SentencePiece NTvocabmerges unk_token replacementadd_prefix_spacedropoutfuse_unkc ||tt|||||}ntt|||}|jt||j t|gt |_|rdnd} tj|| |_ tj|| |_ d||||d} t |5|| y)N)rrralwaysnever)rprepend_schemeSentencePieceBPE)modelrrrr)r r token_to_idstradd_special_tokensr normalizerr Metaspace pre_tokenizerr decodersuper__init__) selfrrrrrrr tokenizerr parameters __class__s r/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr*z"SentencePieceBPETokenizer.__init__s  !3!#eVWPYdl"mnI!#gU]"^_I  Y 0 <  ( (#i.)9 :#v %57"0":":{cq"r $..;Wef ("& 0   J/vocab_filenamemerges_filenamec Ntj||\}}t||fi|S)N)r read_filer)r1r2kwargsrrs r/ from_filez#SentencePieceBPETokenizer.from_file1s( noF v(A&AAr0i0uifiles vocab_size min_frequencyspecial_tokenslimit_alphabetinitial_alphabet show_progressctj||||||}t|tr|g}|jj ||y)z%Train the model using the given filesr9r:r;r<r=r>)trainerN)r BpeTrainer isinstancer# _tokenizertrain) r+r8r9r:r;r<r=r>rAs r/rEzSentencePieceBPETokenizer.train6sP%%!'))-'   eS !GE eW5r0iteratorlengthc vtj||||||} |jj|| |y)z(Train the model using the given iteratorr@)rArGN)r rBrDtrain_from_iterator) r+rFr9r:r;r<r=r>rGrAs r/rIz-SentencePieceBPETokenizer.train_from_iteratorNsH%%!'))-'   ++  , r0)NNru▁TNF)__name__ __module__ __qualname____doc__rrr#rintrrr boolfloatr* staticmethodr6rErrI __classcell__)r.s@r/rr s7;>B,3 !%#'#(0c4S>1230sDsCx$99:;0j) 0  0  0%04.0BB#BBB 8?y"&("6S$s)^$66 6 U3 ?34 6  6s)666 8?y"&(" $  x '>>?    U3 ?34    s)    r0rN)typingrrrrrr tokenizersr r r r r tokenizers.modelsrtokenizers.normalizersrbase_tokenizerrrr0r/rYs(??PP!')] ] r0