# Natural Language Toolkit: Interface to the Repp Tokenizer
#
# Authors: Rebecca Dridan and Stephan Oepen
# Contributors: Liling Tan
# For license information, see LICENSE.TXT

import os
import re
import sys
import subprocess
import tempfile

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
    """
    A class for word tokenization using the REPP parser described in
    Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to
    a Long Solved Problem - A Survey, Contrastive Experiment,
    Recommendations, and Toolkit. In ACL.
    http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406

    >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
    ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
    ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
    ... ]
    >>> tokenizer = ReppTokenizer('/home/alvas/repp/')  # doctest: +SKIP
    >>> for sent in sents:                              # doctest: +SKIP
    ...     tokenizer.tokenize(sent)                    # doctest: +SKIP
    ...
    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
    >>> for sent in tokenizer.tokenize_sents(sents):  # doctest: +SKIP
    ...     print(sent)                               # doctest: +SKIP
    ...
    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
    >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True):  # doctest: +SKIP
    ...     print(sent)                                                          # doctest: +SKIP
    ...
    [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
    [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
    [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
    """

    def __init__(self, repp_dir, encoding="utf8"):
        self.repp_dir = self.find_repptokenizer(repp_dir)
        # Set a directory to store the temporary files.
        self.working_dir = tempfile.gettempdir()
        # Set an encoding for the input strings.
        self.encoding = encoding

    def tokenize(self, sentence):
        """
        Use Repp to tokenize a single sentence.

        :param sentence: A single sentence string.
        :type sentence: str
        :return: A tuple of tokens.
        :rtype: tuple(str)
        """
        return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
        Tokenize multiple sentences using Repp.

        :param sentences: A list of sentence strings.
        :type sentences: list(str)
        :return: An iterator of tuples of tokens.
        :rtype: iter(tuple(str))
        """
        with tempfile.NamedTemporaryFile(
            prefix="repp_input.",
            dir=self.working_dir,
            mode="w",
            encoding=self.encoding,
            delete=False,
        ) as input_file:
            # Write sentences to the temporary input file.
            for sent in sentences:
                input_file.write(str(sent) + "\n")
            # Close the file so its contents are flushed before REPP reads it.
            input_file.close()
            # Generate the command to run REPP on the input file.
            cmd = self.generate_repp_command(input_file.name)
            # Decode the stdout and strip the trailing newline.
            repp_output = self._execute(cmd).decode(self.encoding).strip()
            for tokenized_sent in self.parse_repp_outputs(repp_output):
                if not keep_token_positions:
                    # Remove the token position information.
                    tokenized_sent, starts, ends = zip(*tokenized_sent)
                yield tokenized_sent

    def generate_repp_command(self, inputfilename):
        """
        Generate the REPP command to be run at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        """
        cmd = [self.repp_dir + "/src/repp"]
        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
        cmd += ["--format", "triple"]
        cmd += [inputfilename]
        return cmd

    @staticmethod
    def _execute(cmd):
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
        Parse the tri-tuple format that REPP outputs with the
        "--format triple" option and return a generator of tokenized
        sentences.

        :param repp_output: the raw stdout of the REPP command
        :type repp_output: str
        :return: an iterable of the tokenized sentences as tuples of strings
        :rtype: iter(tuple)
        """
        line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
        # Sentences are separated by blank lines in REPP's triple output.
        for section in repp_output.split("\n\n"):
            words_with_positions = [
                (token, int(start), int(end))
                for start, end, token in line_regex.findall(section)
            ]
            words = tuple(t[0] for t in words_with_positions)
            yield words_with_positions

    def find_repptokenizer(self, repp_dirname):
        """
        A method to find the REPP tokenizer binary and its *repp.set*
        config file.
        """
        if os.path.exists(repp_dirname):  # If a full path is given.
            _repp_dir = repp_dirname
        else:  # Try to find the REPP directory in environment variables.
            _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
        # Check for the REPP binary and the erg/repp.set config file.
        assert os.path.exists(_repp_dir + "/src/repp")
        assert os.path.exists(_repp_dir + "/erg/repp.set")
        return _repp_dir
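
# ---------------------------------------------------------------------------
# A minimal, self-contained usage sketch (not part of the NLTK module).
# Constructing a ReppTokenizer requires a local REPP installation, but the
# static ``parse_repp_outputs`` method can be exercised on its own.  The
# ``sample_output`` string below is a hand-written assumption of what REPP's
# "--format triple" mode emits: one "(start, end, token)" line per token,
# with sentences separated by blank lines.  The installation path in the
# commented-out lines is hypothetical.

if __name__ == "__main__":
    sample_output = "(0, 5, Hello)\n(6, 7, !)\n\n(0, 3, Bye)\n(4, 5, .)"
    for tokens_with_positions in ReppTokenizer.parse_repp_outputs(sample_output):
        print(tokens_with_positions)
    # Prints:
    #   [('Hello', 0, 5), ('!', 6, 7)]
    #   [('Bye', 0, 3), ('.', 4, 5)]

    # With a local REPP installation (path hypothetical):
    # tokenizer = ReppTokenizer("/path/to/repp/")
    # print(tokenizer.tokenize("Tokenization is a solved problem."))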