"""This file is based on pattern.en. See the bundled NOTICE file for
license information.
"""

import os

from textblob._text import CHUNK, PENN, PNP, POS, UNIVERSAL, WORD, Lexicon, Spelling
from textblob._text import Parser as _Parser
from textblob._text import Sentiment as _Sentiment

# Directory that the bundled English data files (en-*.txt, en-sentiment.xml)
# are resolved against.
try:
    MODULE = os.path.dirname(os.path.abspath(__file__))
except Exception:
    # Best effort: if __file__ is unavailable, fall back to paths relative to
    # the current directory. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    MODULE = ""

spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))

# --- ENGLISH PARSER --------------------------------------------------------------------------------


def find_lemmata(tokens):
    """Annotates the tokens with lemmata for plural nouns and conjugated verbs,
    where each token is a [word, part-of-speech] list.

    The lowercased lemma is appended to every token in place (tokens with any
    other tag get their own word, lowercased), and the same ``tokens`` list
    is returned.
    """
    # NOTE(review): singularize, conjugate and INFINITIVE are not imported in
    # the visible part of this module -- confirm they are in scope, otherwise
    # the "NNS" and verb branches below raise NameError.
    for token in tokens:
        word, pos = token[0], token[1]
        lemma = word
        if pos == "NNS":
            # cats => cat
            lemma = singularize(word)
        elif pos.startswith(("VB", "MD")):
            # sat => sit; keep the word itself when conjugate() returns
            # a falsy result.
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens


class Parser(_Parser):
    """English parser: plugs lemmatization and tagset mapping into _Parser."""

    def find_lemmata(self, tokens, **kwargs):
        """Annotates tokens via the module-level find_lemmata().

        Keyword arguments are accepted but not used.
        """
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        """Tags tokens via _Parser.find_tags(), supplying a default "map"
        callable that depends on the requested "tagset" keyword.

        A "map" passed explicitly by the caller is never overridden.
        """
        tagset = kwargs.get("tagset")
        if tagset in (PENN, None):
            # Penn Treebank (or unspecified): leave (token, tag) untouched.
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        if tagset == UNIVERSAL:
            # NOTE(review): penntreebank2universal is not imported in the
            # visible part of this module -- confirm it is in scope. The
            # lambda defers the name lookup until the map is actually called.
            kwargs.setdefault(
                "map", lambda token, tag: penntreebank2universal(token, tag)
            )
        return _Parser.find_tags(self, tokens, **kwargs)


class Sentiment(_Sentiment):
    """English sentiment lexicon that also derives adverb entries from adjectives."""

    def load(self, path=None):
        """Loads the lexicon via _Sentiment.load(); when no explicit ``path``
        is given, additionally annotates an "RB" (adverb) entry for every
        word that has a "JJ" (adjective) entry.
        """
        _Sentiment.load(self, path)
        if path:
            return
        # Map "terrible" to adverb "terribly" (+1% accuracy)
        # The items are snapshotted with list() because annotate() is called
        # while looping. dict.items is invoked unbound, so any items()
        # override on the class is bypassed (presumably to avoid re-entering
        # a lazy load -- confirm against _Sentiment).
        for w, pos in list(dict.items(self)):
            if "JJ" not in pos:
                continue
            if w.endswith("y"):
                # happy => happi(ly)
                w = w[:-1] + "i"
            if w.endswith("le"):
                # terrible => terrib(ly)
                w = w[:-2]
            # The adjective's three scores are copied to the adverb entry.
            p, s, i = pos["JJ"]
            self.annotate(w + "ly", "RB", p, s, i)


lexicon = Lexicon(
    path=os.path.join(MODULE, "en-lexicon.txt"),
    morphology=os.path.join(MODULE, "en-morphology.txt"),
    context=os.path.join(MODULE, "en-context.txt"),
    entities=os.path.join(MODULE, "en-entities.txt"),
    language="en",
)

parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")

sentiment = Sentiment(
    path=os.path.join(MODULE, "en-sentiment.xml"),
    synset="wordnet_id",
    negations=("no", "not", "n't", "never"),
    modifiers=("RB",),
    modifier=lambda w: w.endswith("ly"),
    tokenizer=parser.find_tokens,
    language="en",
)


def tokenize(s, *args, **kwargs):
    """Returns a list of sentences, where punctuation marks have been split from words."""
    return parser.find_tokens(str(s), *args, **kwargs)


def parse(s, *args, **kwargs):
    """Returns the tagged string produced by the module-level parser for ``s``.

    ``s`` is converted with str(); extra arguments are forwarded unchanged to
    parser.parse().
    """
    return parser.parse(str(s), *args, **kwargs)


def parsetree(s, *args, **kwargs):
    """Returns a parsed Text from the given string."""
    # NOTE(review): Text is not imported in the visible part of this module --
    # confirm it is in scope, otherwise this raises NameError.
    return Text(parse(str(s), *args, **kwargs))


def split(s, token=None):
    """Returns a parsed Text from the given parsed string.

    ``token`` lists the annotation fields per token and defaults to
    [WORD, POS, CHUNK, PNP].
    """
    if token is None:
        token = [WORD, POS, CHUNK, PNP]
    # NOTE(review): Text is not imported in the visible part of this module --
    # confirm it is in scope, otherwise this raises NameError.
    return Text(str(s), token)


def tag(s, tokenize=True, encoding="utf-8"):
    """Returns a list of (token, tag)-tuples from the given string."""
    # The flags are forwarded to parser.parse() positionally; presumably they
    # are (tokenize, tags, chunks, relations, lemmata, encoding) -- confirm
    # against _Parser.parse.
    tagged = parse(s, tokenize, True, False, False, False, encoding)
    return [
        (token[0], token[1])
        for sentence in tagged.split()
        for token in sentence
    ]


def suggest(w):
    """Returns a list of (word, confidence)-tuples of spelling corrections."""
    return spelling.suggest(w)


def polarity(s, **kwargs):
    """Returns the sentence polarity (positive/negative) between -1.0 and 1.0."""
    return sentiment(str(s), **kwargs)[0]


def subjectivity(s, **kwargs):
    """Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0."""
    return sentiment(str(s), **kwargs)[1]


def positive(s, threshold=0.1, **kwargs):
    """Returns True if the given sentence has a positive sentiment (polarity >= threshold)."""
    return polarity(str(s), **kwargs) >= threshold