JL izddlZddlZddlZddlZddlmZddlmZmZm Z m Z ddl m Z ddl mZdZGddeZy) N)PIPE) _java_options config_javafind_jarjava) CoreNLPParser) TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlcDeZdZdZdZ ddZedZdZd dZ y) StanfordTokenizeraF Interface to the Stanford Tokenizer >>> from nltk.tokenize.stanford import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> s = "The colour of the wall is blue." >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] zstanford-postagger.jarNctjtdtdt |j |ddt ||_||_||_ |in|}djd|jD|_ y) Nzz The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.parse.corenlp.CoreNLPParser instead.') stacklevel)STANFORD_POSTAGGER)env_vars searchpathurlverbose,c30K|]\}}|d|yw)=Nr).0keyvals \/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/nltk/tokenize/stanford.py z-StanfordTokenizer.__init__..Es$TSuAcU^$Ts) warningswarnstrDeprecationWarningr_JAR _stanford_url _stanford_jar _encoding java_optionsjoinitems _options_cmd)self path_to_jarencodingoptionsrr%s r__init__zStanfordTokenizer.__init__%s  W   & II ,  "("WHH$TGMMO$TTc"|jS)N) splitlines)ss r_parse_tokenized_outputz)StanfordTokenizer._parse_tokenized_outputGs||~r.cJdg}|j|j||S)zW Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. z%edu.stanford.nlp.process.PTBTokenizer)r2_execute)r)r1cmds rtokenizezStanfordTokenizer.tokenizeKs(77++DMM#q,ABBr.c|j}|jd|g|j}|r|jd|jgdjt}t |j |tjdd5}t|tr|r|j|}|j||j|j|jt!||j"t$t$\}} |j'|}dddt)j*jt |dS#1swY7xYw) Nz-charsetz-options )r,rwbF)modedelete) classpathstdoutstderr)r$extendr(r&rrr%tempfileNamedTemporaryFile isinstancerencodewriteflushappendnamerr#rdecodeosunlink) r)r5input_rr+r(default_options input_filer=r>s rr4zStanfordTokenizer._executeRs >> J)*((  JJ D$5$56 7((=1 D--w? ( (d5 A -Z&#&8x0   V $     JJz '"t11$tNFF]]8,F - *//" OU; ) - -s BEE")Nutf8NFz-mx1000m)F) __name__ __module__ __qualname____doc__r!r- staticmethodr2r6r4rr.rr r sE  $D UDC!r.r )jsonrIr@r subprocessrnltk.internalsrrrrnltk.parse.corenlprnltk.tokenize.apir r"r rr.rrYs5 EE,(C ] ]r.