import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]


class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
    revert to the original string.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"\s+"), " "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$', re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.{2,}", re.U), r" \g<0> "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        # Handles the final period.
        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r"\1 \2\3 "),
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (re.compile(r"[*]", re.U), r" \g<0> "),
    ]

    # Pads parentheses and brackets with spaces.
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me',
        'two', 'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no "
                "longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # Note: CONTRACTIONS4 is defined on MacIntyreContractions but is
        # deliberately not applied here.

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
        True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes, but only
        # if the original text contains double quote(s) or double single
        # quotes (because '' might be transformed to `` if it is treated as
        # a starting quote).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes in the original text.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quote tokens with the original double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
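
# ---------------------------------------------------------------------------
# Illustrative usage sketch (an editorial addition, not part of the NLTK
# distribution): a minimal demo of the two public methods above. The sample
# sentences are assumptions chosen only to exercise the contraction splitting
# and the quote realignment performed by `span_tokenize`.
if __name__ == "__main__":
    tokenizer = NLTKWordTokenizer()

    # MacIntyre's CONTRACTIONS2 patterns split "gonna" into "gon" + "na",
    # while the ENDING_QUOTES rules peel off clitics such as "'m" and "n't".
    print(tokenizer.tokenize("I'm gonna say he can't swim."))
    # -> ['I', "'m", 'gon', 'na', 'say', 'he', 'ca', "n't", 'swim', '.']

    # span_tokenize yields (start, end) offsets into the *original* string;
    # the `` / '' tokens produced by tokenize() are mapped back to the
    # source double quotes before alignment.
    s = 'She said, "Hello."'
    for start, end in tokenizer.span_tokenize(s):
        print((start, end), repr(s[start:end]))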