JL i]%vddlZddlZddlZddlZddlmZddlmZmZm Z m Z m Z m Z ddl mZdZGddeZy)N)PIPE) _java_options config_javafind_dir find_filefind_jarjava) TokenizerIz!https://nlp.stanford.edu/softwarecfeZdZdZdZ d dZdZfdZdZdZ dZ d d Z xZ S) StanfordSegmenteru[Interface to the Stanford Segmenter If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j should be provieded, for example:: seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar') >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter >>> seg = StanfordSegmenter() # doctest: +SKIP >>> seg.default_config('zh') # doctest: +SKIP >>> sent = u'这是斯坦福中文分词器测试' >>> print(seg.segment(sent)) # doctest: +SKIP 这 是 斯坦福 中文 分词器 测试 >>> seg.default_config('ar') # doctest: +SKIP >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات' >>> print(seg.segment(sent.split())) # doctest: +SKIP هذا هو تصنيف ستانفورد العربي ل الكلمات zstanford-segmenter.jarc Ttjdttjt dtdtjdtt |j |ddt| } |t d |d dt| }nd}tjjd | |fD|_ ||_ ||_ ||_||_||_||_| |_| |_| in| } d jd | j)D|_y)Nalwaysz} The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.parse.corenlp.CoreNLPTokenizer instead.') stacklevelignoreSTANFORD_SEGMENTER)env_vars searchpathurlverbosez slf4j-api.jar)SLF4Jrc3&K|] }|| ywNr).0_s f/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/nltk/tokenize/stanford_segmenter.py z-StanfordSegmenter.__init__..js- amA- s,c3VK|]!\}}|dtj|#yw)=N)jsondumps)rkeyvals rrz-StanfordSegmenter.__init__..xs-% +33se1TZZ_% &% s'))warnings simplefilterDeprecationWarningwarnstrr_JAR _stanford_urlospathsepjoin _stanford_jar _java_class_model_sihan_corpora_dict_sihan_post_processing_keep_whitespaces_dict _encoding java_optionsitems _options_cmd)self path_to_jar path_to_slf4j java_class path_to_model path_to_dictpath_to_sihan_corpora_dictsihan_post_processingkeep_whitespacesencodingoptionsrr9stanford_segmenterslf4js r__init__zStanfordSegmenter.__init__8s- h(:; Z    h(:;% II ,    $8! EE ZZ__- *E2-  &# #= &;#!1! !("WHH% 7>}}%  cd}tjjdr>tjj tjjddh}d|_d|_d|_|dk(r d|_d}n{|d k(rhd |_d }d |_d } t||tdd|_d} t|tdd}tjj |||_ntd| t||tdd|_ y#t$r}td|z|d}~wwxYw#t$r}td|z|d}~wwxYw#t$r}td|z|d}~wwxYw)z Attempt to initialize Stanford Word Segmenter for the specified language using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables rrdataNfalsearz=edu.stanford.nlp.international.arabic.process.ArabicSegmenterz'arabic-segmenter-atb+bn+arztrain.ser.gzzhz%edu.stanford.nlp.ie.crf.CRFClassifierzpku.gztruezdict-chris6.ser.gzF)STANFORD_MODELS)rrrrz_Could not find '%s' (tried using env. variables STANFORD_MODELS and /data/)z./data/r)rrrzMCould not find '%s' (tried using the STANFORD_SEGMENTER environment variable)zUnsupported language )rQr)r.environgetpathr0r7r4r5r2rr- LookupErrorrr3)r<lang search_pathmodelrAe sihan_dirpath_to_sihan_dirs rdefault_configz StanfordSegmenter.default_config|s  ::>>. /77<< 7K(LfUVK #' &-# 4<O  >E T\FD E*0D '/L & *%!1  "I $,%!4 %! ,.77<<8I9+U( 5dV<= = #&!B DK3 !P"#    !?AJK   LNST  sHD88D:E D7#D22D7: EEE E7#E22E7c$t||yr)supertokenize)r<s __class__s rr_zStanfordSegmenter.tokenizes rJc|jd|jd|jd|g}|j5|j d|j d|jd|j g|j|}|S) -loadClassifier-keepAllWhitespaces -textFile-serDictionary-sighanCorporaDict-sighanPostProcessing)r2r3r6r4extendr7r5_execute)r<input_file_pathcmdstdouts r segment_filezStanfordSegmenter.segment_files     KK !  " "     # # / JJ$JJ(,,+//  s# rJc&|j|gSr) segment_sents)r<tokenss rsegmentzStanfordSegmenter.segments!!6(++rJc|j}tjd\}|_t j |d}dj d|D}t|tr|r|j|}|j||j|jd|jd|jd|jg}|j5|j!d |j"d |jd |j$g|j'|}t j(|j|S) rcT)textwb c3>K|]}dj|yw)rcN)r0)rxs rrz2StanfordSegmenter.segment_sents..s:1388A;:srdrerfrgrhri)r8tempfilemkstemp_input_file_pathr.fdopenr0 isinstancer+encodewritecloser2r3r6r4rjr7r5rkunlink)r< sentencesrE _input_fh_inputrmrns rrqzStanfordSegmenter.segment_sentss>>+3+;+;+F( 4(IIi. : :: fc "x]]8,F     KK !  " "   ! !   # # / JJ$JJ(,,+//  s# $''( rJcv|j}|jd|g|j}|r|jd|jgdjt}t |j |t||jtt\}}|j|}t |d|S)Nz-inputEncodingz-optionsrc)rFr) classpathrnstderrF) r8rjr;r0rrr9r r1rdecode)r<rmrrEr;default_optionsrn_stderrs rrkzStanfordSegmenter._executes>> $h/0((  JJ D$5$56 7((=1 D--w? 4--d4 x( OU; rJ) NNNNNNrMrMzUTF-8NFz-mx2g)F) __name__ __module__ __qualname____doc__r,rIr\r_rorsrqrk __classcell__)ras@rr r s]* $D#'% B HGR6,(TrJr )r#r.rzr' subprocessrnltk.internalsrrrrrr nltk.tokenize.apir r-r rrJrrs8 )3 D DrJ