"""
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""

import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    The international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character classes used by the NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols inside character
    # classes, see https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile(f"([{number_regex}])([{punct_regex}])"), "\\1 \\2 "
    PUNCT_2 = re.compile(f"([{punct_regex}])([{number_regex}])"), " \\1 \\2"
    # Tokenize symbols.
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # Unescaping happens between the two strip regexes to stay close to
        # the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strip heading and trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # first before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure there is only one space between words, then strip
        # leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()
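
# A minimal usage sketch of the two tokenizers defined above; the sample
# sentences are illustrative assumptions, not fixtures from the NIST script.
if __name__ == "__main__":
    nist = NISTTokenizer()

    # Western-language text: tokenize() applies the language-dependent
    # punctuation/period/comma/dash regexes after the NIST pre-processing.
    print(nist.tokenize("Good muffins cost $3.88 in New York.", lowercase=True))
    # ['good', 'muffins', 'cost', '$', '3.88', 'in', 'new', 'york', '.']

    # Mixed-script text: international_tokenize() pads non-ASCII runs and
    # splits on the Unicode Number/Punctuation/Symbol classes instead.
    print(nist.international_tokenize("Rakuten, Inc. (楽天株式会社) is based in Tokyo."))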