JL inZdZddlZddlZddlZddlmZmZmZddlm Z ddl m Z ddl m Z ddlmZddlmZdd lmZmZdd lmZdd lmZdd lmZdd lmZmZmZedgdZGddZ GddZ!GddZ"GddZ#Gdde#Z$dZ%e&dk(re%gdZ'y)a  This module brings together a variety of NLTK functionality for text analysis, and provides simple, interactive interfaces. Functionality includes: concordancing, collocation discovery, regular expression search over tokenized strings, and distributional similarity. N)Counter defaultdict namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures f_measure)ConditionalFreqDist)FreqDist) sent_tokenize)LazyConcatenation cut_string tokenwrapConcordanceLine)leftqueryrightoffset left_print right_printlinecLeZdZdZedZdddfdZdZdZd dZ d d Z y) ContextIndexa A bidirectional index between words and their 'contexts' in a text. The context of a word is usually defined to be the words that occur in a fixed window around the word; but other definitions may also be used by providing a custom context function. c|dk7r||dz jnd}|t|dz k7r||dzjnd}||fS)z;One left token and one right token, normalized to lowercaser*START**END*)lowerlen)tokensirrs O/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/nltk/text.py_default_contextzContextIndex._default_context/sS)*Qva!e}""$I)*c&kAo)=q1u ##%7e}Nc|SNxs r%zContextIndex.6sQr'c&|__|r|_nj_|rDcgc] }||s |c}t fdt D_t fdt D_ycc}w)Nc3jK|]*\}}j|j|f,ywr))_key _context_func.0r$wselfr#s r% z(ContextIndex.__init__..?s4% >BaTYYq\4--fa8 9% 03c3jK|]*\}}j|j|f,ywr))r1r0r2s r%r6z(ContextIndex.__init__..Bs4% >BaT   *DIIaL 9% r7)r0_tokensr1r&CFD enumerate_word_to_contexts_context_to_words)r5r# context_funcfilterkeyts`` r%__init__zContextIndex.__init__6s  !-D !%!6!6D  !'5A6!9a5F!$% FOPVFW% " "%% FOPVFW% "  6s BBc|jS)zw :rtype: list(str) :return: The document that this context index was created from. r9r5s r%r#zContextIndex.tokensF ||r'c|j|}t|j|}i}|jjD]\}}t |t|||<|S)z Return a dictionary mapping from words to 'similarity scores,' indicating how often these two words occur in the same context. )r0setr<itemsr )r5word word_contextsscoresr4 w_contextss r%word_similarity_dictz!ContextIndex.word_similarity_dictNsj yyD22489 !3399; BMAz!-ZAF1I B r'c0tt}|j|j|D]L}|j|D]8}||k7s ||xx|j|||j||zz cc<:Nt ||j dd|S)NT)r@reverse)rintr<r0r=sortedget)r5rJnrLcr4s r% similar_wordszContextIndex.similar_words]sS!'' $8 A++A. 91I..q1$7$:P:PQR:STU:VVI   f&**d;BQ??r'c|Dcgc]}j|}}|Dcgc]}tj|}}tt |Dcgc] }||r ||}}t tj ||r|rtddj|s tStfd|D}|Scc}wcc}wcc}w)a Find contexts where the specified words can all appear; and return a frequency distribution mapping each context to the number of times that context was used. :param words: The words used to seed the similarity search :type words: str :param fail_on_unknown: If true, then raise a value error if any of the given words do not occur at all in the index. z%The following word(s) were not found: c3TK|]}j|D] }|vs| !ywr))r<)r3r4rUcommonr5s r%r6z/ContextIndex.common_contexts..|s9$*@*@*C%&qF{s( () r0rHr<ranger"r intersection ValueErrorjoinr) r5wordsfail_on_unknownr4contextsr$emptyfdrZs ` @r%common_contextszContextIndex.common_contextsgs(--!1-- -1;  @r'rc>eZdZdZdfdZdZdZdZd dZd dZ y ) ConcordanceIndexzs An index that can be used to look up the offset locations at which a given word occurs in a document. c|Sr)r*r+s r%r-zConcordanceIndex.sQr'c||_ ||_ tt|_ t |D]4\}}|j|}|j|j |6y)a Construct a new concordance index. :param tokens: The document (list of tokens) that this concordance index was created from. This list can be used to access the context of a given word occurrence. :param key: A function that maps each token to a normalized version that will be used as a key in the index. E.g., if you use ``key=lambda s:s.lower()``, then the index will be case-insensitive. N)r9r0rlist_offsetsr;append)r5r#r@indexrJs r%rBzConcordanceIndex.__init__sg   D#D) L$V, .KE499T?D MM$  & &u - .r'c|jS)z{ :rtype: list(str) :return: The document that this concordance index was created from. rDrEs r%r#zConcordanceIndex.tokensrFr'cB|j|}|j|S)z :rtype: list(int) :return: A list of the offset positions at which the given word occurs. If a key function was specified for the index, then given word's key will be looked up. )r0rqr5rJs r%offsetszConcordanceIndex.offsetss yy}}T""r'c\dt|jt|jfzS)Nz+)r"r9rqrEs r%__repr__zConcordanceIndex.__repr__s-<    @   r'c Ht|tr|}n|g}dj|}td|D}||z dz dz}|dz}g}|j |d} t |ddD]C\} }|j |D chc] } | | z dz  } } t | j| } E| r| D]} dj|j| | t|z} |jtd| |z | }|j| t|z| |z}tdj|| j|}tdj||}dj|| |g}t|| || |||}|j||Scc} w)z Find all concordance lines given the query word. Provided with a list of words, these will be found as a phrase. rXc3LK|]}tj|rdyw)rN) unicodedata combining)r3chars r%r6z4ConcordanceIndex.find_concordance..sUt9N9Nt9TUs$$rrN) isinstancerpr^sumrwr;rRr\r9r"maxrrjustrrr)r5rJwidthphrase phrase_str phrase_len half_widthcontextconcordance_listrwr$r word_offsets query_word left_context right_contextrr line_printconcordance_lines r%find_concordancez!ConcordanceIndex.find_concordances dD !FVFXXf% UzUU j(1,2 1*,,vay) , AGAt9=d9KLvFQJNLLL\66w?@G A  : XXdll1q3v;&GH #||C1w;,?!D $ QV_q7{ K '(> LRR )-)@*M  XXz:{&KL #2 !$ !''(89- :. 5MsFc|j||}|s tdyt|t|}td|dt|dt |d|D]\}}t|j y)a Print concordance lines given the query word. :param word: The target word or phrase (a list of strings) :type word: str or list :param lines: The number of lines to display (default=25) :type lines: int :param width: The width of each line, in characters (default=80) :type width: int :param save: The option to save the concordance. :type save: bool )rz no matchesz Displaying z of z matches:N)rprintminr"r;r)r5rJrlinesrr$rs r%print_concordancez"ConcordanceIndex.print_concordances 00U0C , s#345E Kwd3/?+@*AK L'01A&51I'J -##&++, -r'N)P)r) rgrhrirjrBr#rwryrrr*r'r%rmrms+ $/.4# . `-r'rmceZdZdZdZdZy) TokenSearchera A class that makes it easier to use regular expressions to search over tokenized strings. The tokenized string is converted to a string where tokens are marked with angle brackets -- e.g., ``''``. The regular expression passed to the ``findall()`` method is modified to treat angle brackets as non-capturing parentheses, in addition to matching the token boundaries; and to have ``'.'`` not match the angle brackets. c>djd|D|_y)Nc3,K|] }d|zdzyw)<>Nr*)r3r4s r%r6z)TokenSearcher.__init__.. s:aC!GcM:s)r^_raw)r5r#s r%rBzTokenSearcher.__init__ sGG:6:: r'ctjdd|}tjdd|}tjdd|}tjdd|}tj||j}|D]0}|j dr|j ds't d |Dcgc]}|d d jd }}|Scc}w) a Find instances of the regular expression in the text. The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> from nltk.text import TokenSearcher >>> from nltk.book import text1, text5, text9 >>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str z\srrz(?:<(?:rz)>)z (?]z$Bad regexp for TokenSearcher.findallrz><)resubfindallr startswithendswithr]splitr5regexphitshs r%rzTokenSearcher.findalls0r6*i0eV, ff5zz&$)), IA<<$C !GHH I .22!B d#22 3s6CN)rgrhrirjrBrr*r'r%rrs;'r'rceZdZdZdZddZdZdZddZddZ dd Z dd Z d Z d Z d ZddZddZdZddZddZdZdZdZej0dZdZdZdZy) Texta A wrapper around a sequence of simple (string) tokens, which is intended to support initial exploration of texts (via the interactive console). Its methods perform a variety of analyses on the text's contexts (e.g., counting, concordancing, collocation discovery), and display the results. If you wish to write a program which makes use of these analyses, then you should bypass the ``Text`` class, and use the appropriate analysis function or class directly instead. A ``Text`` is typically initialized from a given document or corpus. E.g.: >>> import nltk.corpus >>> from nltk.text import Text >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')) TNc|jr t|}||_|r||_yd|ddvr5|ddj d}dj d|d|D|_ydj d|ddDd z|_y) zv Create a Text object. :param tokens: The source text. :type tokens: sequence of str ]NrfrXc32K|]}t|ywr)strr3toks r%r6z Text.__init__..bs CcS Crc32K|]}t|ywr)rrs r%r6z Text.__init__..ds @cS @rz...) _COPY_TOKENSrpr#namersr^)r5r#rends r%rBz Text.__init__Ss   &\F DI F3BK "+##C(C CVAc] CCDI @VBQZ @@5HDIr'c |j|Sr))r#)r5r$s r% __getitem__zText.__getitem__js{{1~r'c,t|jSr))r"r#rEs r%__len__z Text.__len__ms4;;r'cd|jvrt|jd|_|jj |||S)a Prints a concordance for ``word`` with the specified context window. Word matching is not case-sensitive. :param word: The target word or phrase (a list of strings) :type word: str or list :param width: The width of each line, in characters (default=80) :type width: int :param lines: The number of lines to display (default=25) :type lines: int :seealso: ``ConcordanceIndex`` _concordance_indexc"|jSr)r!ss r%r-z"Text.concordance.. 1779r'r@)__dict__rmr#rrr5rJrrs r% concordancezText.concordancetsD t}} 4&6 !4'D #&&88ueLLr'cd|jvrt|jd|_|jj ||d|S)a Generate a concordance for ``word`` with the specified context window. Word matching is not case-sensitive. :param word: The target word or phrase (a list of strings) :type word: str or list :param width: The width of each line, in characters (default=80) :type width: int :param lines: The number of lines to display (default=25) :type lines: int :seealso: ``ConcordanceIndex`` rc"|jSr)rrs r%r-z'Text.concordance_list..rr'rN)rrmr#rrrs r%rzText.concordance_listsI t}} 4&6 !4'D #&&77eDVeLLr'cd|jvr|j|k(r|j|k(s||_||_ddlm}|j dt j|j|}|jd|jfdt}t|j|j||_|jS)a Return collocations derived from the text, ignoring stopwords. >>> from nltk.book import text4 >>> text4.collocation_list()[:2] [('United', 'States'), ('fellow', 'citizens')] :param num: The maximum number of collocations to return. :type num: int :param window_size: The number of tokens spanned by a collocation (default=2) :type window_size: int :rtype: list(tuple(str, str)) _collocationsr) stopwordsenglishrcHt|dkxs|jvS)N)r"r!)r4 ignored_wordss r%r-z'Text.collocation_list..ss1vz/WQWWY-=Wr')r_num _window_size nltk.corpusrr_r from_wordsr#apply_freq_filterapply_word_filterr rpnbestlikelihood_ratior)r5num window_sizerfinderbigram_measuresrs @r%collocation_listzText.collocation_lists t}} , S !![0DI +D  .%OOI6M,77 [QF  $ $Q '  $ $%W X13O!% _==sC"D !!!r'c|j||Dcgc] \}}|dz|z}}}tt|dycc}}w)a Print collocations derived from the text, ignoring stopwords. >>> from nltk.book import text4 >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE United States; fellow citizens; years ago; four years; Federal Government; General Government; Vice President; American people; God bless; Chief Justice; one another; fellow Americans; Old World; Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian tribes; public debt; foreign nations :param num: The maximum number of collocations to print. :type num: int :param window_size: The number of tokens spanned by a collocation (default=2) :type window_size: int rX; ) separatorN)rrr)r5rrw1w2collocation_stringss r% collocationszText.collocationssO()-(=(=c;(O $b"BHrM   i+t<= sAc8|jj|S)zJ Count the number of times this word appears in the text. )r#countrvs r%rz Text.count{{  &&r'c8|jj|S)zQ Find the index of the first occurrence of the word in the text. )r#rsrvs r%rsz Text.indexrr'ctr))NotImplementedError)r5methods r% readabilityzText.readabilitys!!r'cd|jvrt|jdd|_j |jj j vrjttfdj D}|j|Dcgc]\}}| }}}tt|ytdycc}}w)a~ Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first. :param word: The word used to seed the similarity search :type word: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.similar_words() _word_context_indexc"|jSr))isalphar+s r%r-zText.similar..s aiikr'c"|jSr)rrs r%r-zText.similar..s r')r?r@c3HK|]}|D]}|vr |k(s|ywr)r*)r3r4rUrawcirJs r%r6zText.similar..s?Q=ds"z No matchesN) rrr#rr!r< conditionsrHr most_commonrr) r5rJrrcr4_r_rars ` @@r%similarz Text.similars ! 5'3 $9?R(D $ zz|&&88 3>># #3t9~H)B $&>>##6741aQ7E7 )E" # , 8s/ Cczd|jvrt|jd|_ |jj |d}|s t dy|j |Dcgc]\}}| }}}t td|Dycc}}w#t$r}t |Yd}~yd}~wwxYw)aY Find contexts where the specified words appear; list most frequent common contexts first. :param words: The words used to seed the similarity search :type words: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.common_contexts() rc"|jSr)rrs r%r-z&Text.common_contexts..rr'rTzNo common contexts were foundc32K|]\}}|dz|zyw)rNr*)r3rrs r%r6z'Text.common_contexts..!sL&"bS2 LrN) rrr#rrdrrrr])r5r_rrcr4rranked_contextses r%rdzText.common_contexts s ! 5'3 !4(D $ ))99%FB56131D"EA1"E"EiLOLLM#F  !HH s/)BB+ B7BB B:% B55B:c"ddlm}|||y)z Produce a plot showing the distribution of the words through the text. Requires pylab to be installed. :param words: The words to be plotted :type words: list(str) :seealso: nltk.draw.dispersion_plot() r)dispersion_plotN) nltk.drawr)r5r_rs r%rzText.dispersion_plot&s .e$r'c`t||\}}t|}|j|||S)N)order)r r fit)r5tokenized_sentsrT train_data padded_sentsmodels r%_train_default_ngram_lmzText._train_default_ngram_lm3s/#zrrN)rr^r#r_tokenized_sentshasattrrsysstderrr r r"r;generaterrr) r5lengthrrsentgenerated_tokensidxtokenprefix output_strs r%rz Text.generate9sX")6chht{{6K(L! $DJJsO! t-. +#** ="&">">%%#?#D z>>>z"#f,'##,,i[- / U E>F? ''. / 1 K"#f,/8)$s*Ri(8&(ABB  j9! sEc<|jj|S)zc See documentation for FreqDist.plot() :seealso: nltk.prob.FreqDist.plot() )vocabplot)r5argss r%rz Text.plotgs !tzz|  $''r'cVd|jvrt||_|jS)z. :seealso: nltk.prob.FreqDist _vocab)rrr"rEs r%rz Text.vocabns% 4== ("4.DK{{r'cd|jvrt||_|jj|}|Dcgc]}dj |}}t t |dycc}w)a Find instances of the regular expression in the text. The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> from nltk.book import text1, text5, text9 >>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str _token_searcherrXrN)rrr$rr^rrrs r%rz Text.findallwsb. DMM 1#0#6D ##++F3%)* ** id#$+sA.z \w+|[\.\!\?]c|dz }|dk\rG|jj||s)|dz}|dk\r|jj||s)|dk7r||nd}|dz}|t|krP|jj||s2|dz }|t|kr|jj||s2|t|k7r||nd}||fS)z One left & one right token, both case-normalized. Skip over non-sentence-final punctuation. Used by the ``ContextIndex`` that is created for ``similar()`` and ``common_contexts()``. rrrr ) _CONTEXT_REmatchr")r5r#r$jrrs r%_contextz Text._contexts E1fT--33F1I> FA1fT--33F1I>Fvay  E#f+od&6&6&<&rrEs r%__str__z Text.__str__dii''r'c d|jzSr+r,rEs r%ryz Text.__repr__r.r'r))Or)rfrre)r)dN*)rgrhrirjrrBrrrrrrrrsrrrdrr rrrrrcompiler&r)r-ryr*r'r%rr9s.LI. M*M(!"F>0' ' "  D8 % ,\(%D"**_-K0((r'rc(eZdZdZdZdZdZdZy)TextCollectiona;A collection of texts, which can be loaded with list of texts, or with a corpus consisting of one or more texts, and which supports counting, concordancing, collocation discovery, etc. Initialize a TextCollection as follows: >>> import nltk.corpus >>> from nltk.text import TextCollection >>> from nltk.book import text1, text2, text3 >>> gutenberg = TextCollection(nltk.corpus.gutenberg) >>> mytexts = TextCollection([text1, text2, text3]) Iterating over a TextCollection produces all the tokens of all the texts in order. ct|dr,|jDcgc]}|j|}}||_tj |t |i|_ycc}w)Nr_)rfileidsr__textsrrBr _idf_cache)r5sourcefs r%rBzTextCollection.__init__sV 67 #/5~~/?@!fll1o@F@  d-f56 AsA'c<|j|t|z S)z"The frequency of the term in text.)rr"r5termtexts r%tfzTextCollection.tfszz$#d)++r'cH|jj|}|t|jDcgc] }||vsd c}}t|jdk(r t d|r!t t|j|z nd}||j|<|Scc}w)zThe number of texts in the corpus divided by the number of texts that the term appears in. If a term does not appear in the corpus, 0.0 is returned.Trz+IDF undefined for empty document collectiong)r9rSr"r8r]r)r5r>idfr?matchess r%rBzTextCollection.idfs oo!!$' ;DKKHD44<4HIG4;;1$ !NOO5<#c$++&01#C$'DOOD ! Is BBcJ|j|||j|zSr))r@rBr=s r%tf_idfzTextCollection.tf_idfs wwtT"TXXd^33r'N)rgrhrirjrBr@rBrEr*r'r%r5r5s , 4r'r5czddlm}t|jd}t |t t d|j dt t d|j dt t d|jt t d|jgd t t d |jd t t d t d |dt d|ddt d|jdy)Nr)brownnews) categoriesz Concordance:zDistributionally similar words:z Collocations:zDispersion plot:)rHreportsaid announcedzVocabulary plot:2z Indexing:ztext[3]:rz text[3:5]:ztext.vocab()['news']:) rrGrr_rrrrrrr)rGr?s r%demorOs!  v . /D $K G .V G +,LL G / G @A G IIbM G + *d1g ,Qq " !4::<#78r'__main__)rrmrrr5)(rjrrr| collectionsrrr functoolsrmathrnltk.collocationsrnltk.lmr nltk.lm.preprocessingr nltk.metricsr r nltk.probabilityr r:r nltk.tokenizer nltk.utilrrrrrrmrrr5rOrg__all__r*r'r%r\s 885;77%'>>M XXv|-|-~55p~(~(D +4T+4\9< zF r'