JL izddlZddlZddlmZddlmZddlmZ ddlm Z ddl m Z ddl m Z ddlmZdd lmZmZmZGd d ZGd d ZGddeZdZy#e$rY4wxYw)N)deepcopy) itemgetter)remove)array)sparse)svm)load_svmlight_file)DependencyEvaluatorDependencyGraphParserIc*eZdZdZdZdZddZdZy) Configurationa Class for holding configuration which is the partial analysis of the input sentence. The transition based parser aims at finding set of operators that transfer the initial configuration to the terminal configuration. The configuration includes: - Stack: for storing partially proceeded words - Buffer: for storing remaining input words - Set of arcs: for storing partially built dependency tree This class also provides a method to represent a configuration as list of features. cdg|_ttdt|j|_g|_|j|_t|j |_y)z :param dep_graph: the representation of an input in the form of dependency graph. :type dep_graph: DependencyGraph where the dependencies are not specified. rN) stacklistrangelennodesbufferarcs_tokens _max_address)self dep_graphs a/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/nltk/parse/transitionparser.py__init__zConfiguration.__init__(sL S 5C $89:     ,cdt|jzdzt|jzdzt|jzS)NzStack : z Buffer : z Arcs : )strrrr)rs r__str__zConfiguration.__str__4sQ $**o  $++    $))n   rc*|y|dk(ry|dur|dk(ryy)zs Check whether a feature is informative The flag control whether "_" is informative or not F_T)rfeatflags r_check_informativez Configuration._check_informative>s* < 2: 5=s{rc g}t|jdkDr|jt|jdz }|j|}|j|ddr|j d|dzd|vr+|j|dr|j d|dz|j|dr|j d |dzd |vrC|j|d r/|d j d }|D]}|j d |zt|jdkDr_|jt|jd z }|j|}|j|dr|j d|dzd}d}d} d} |j D]*\} } } | |k(s | | kDr | |kDr| }| } | | ks!| |ks'| }| } ,|j| r|j d| z|j| r|j d| zt|jdkDr|jd}|j|}|j|ddr|j d|dzd|vr+|j|dr|j d|dz|j|dr|j d|dzd |vrC|j|d r/|d j d }|D]}|j d|zt|jdkDru|jd}|j|}|j|ddr|j d|dz|j|dr|j d|dzt|jd kDrI|jd }|j|}|j|dr|j d|dzt|jdkDrI|jd}|j|}|j|dr|j d|dzd}d}d} d} |j D]*\} } } | |k(s | | kDr | |kDr| }| } | | ks!| |ks'| }| } ,|j| r|j d| z|j| r|j d| z|S)a/ Extract the set of features for the current configuration. Implement standard features as describe in Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre. Please note that these features are very basic. :return: list(str) rrwordT STK_0_FORM_lemma STK_0_LEMMA_tag STK_0_POS_feats| STK_0_FEATS_ STK_1_POS_i@Br# STK_0_LDEP_ STK_0_RDEP_ BUF_0_FORM_ BUF_0_LEMMA_ BUF_0_POS_ BUF_0_FEATS_ BUF_1_FORM_ BUF_1_POS_ BUF_2_POS_ BUF_3_POS_ BUF_0_LDEP_ BUF_0_RDEP_)rrrr(appendsplitrr)rresult stack_idx0tokenr0r& stack_idx1 left_most right_most dep_left_mostdep_right_mostwirwj buffer_idx0 buffer_idx1 buffer_idx2 buffer_idx3s rextract_featureszConfiguration.extract_featuresLs tzz?Q C Oa$78JLL,E&&uV}d; meFm;<%D$;$;E'N$K nuW~=>&&uU|4 lU5\9:%D$;$;E'N$Kg,,S1!9DMM.4"7894::"!ZZDJJ!(;<  Z0**5<8MM,u"=> IJMN!YY * Ar#Rb:o%' )*Rb9n$& ()  *&&}5 mm;<&&~6 mn<= t{{ a ++a.KLL-E&&uV}d; meFm;<%D$;$;E'N$K nuW~=>&&uU|4 lU5\9:%D$;$;E'N$Kg,,S1!9DMM.4"7894;;!#"kk!n  [1**5=$?MM-%-"?@**5<8MM,u"=>4;;!#"kk!n  [1**5<8MM,u"=>4;;!#"kk!n  [1**5<8MM,u"=>IJMN!YY * Ar$Rb:o%' )*Rb9n$& ()  *&&}5 mm;<&&~6 mn<= rN)F)__name__ __module__ __qualname____doc__rr!r(rTr%rrrrs  -  drrc>eZdZdZdZdZdZdZdZdZ dZ d Z d Z y ) Transitionz This class defines a set of transition which is applied to a configuration to get another configuration Note that for different parsing algorithm, the transition is different. LEFTARCRIGHTARCSHIFTREDUCEc||_|tjtjfvr.t dtjdtjdy)z :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm :type alg_option: str  Currently we only support  and  N)_algoTransitionParser ARC_STANDARD ARC_EAGER ValueError)r alg_options rrzTransition.__init__sV     ) )  & &  #002B2L2LN   rct|jdkst|jdkry|jddk(ry|jt|jdz }d}|jtj k(r|j D]\}}}||k(s d}|rH|jj|jd}|j j|||fyy)a Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied rr5rTFN) rrrrcrdrfrpoprC) rconfrelationidx_wir' idx_parentrN idx_childidx_wjs rleft_arczTransition.left_arcs   !s4::!'; ;;q>Q C Oa/0 ::)33 3,0II !( Ay& D !  JJNN [[^F II  fh7 8rc@t|jdkst|jdkry|jtj k(rW|jj }|jd}||jd<|jj|||fy|jt|jdz }|jj d}|jj||jj|||fy)z Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied rr5rN) rrrrcrdrerjrrC)rrkrlrmrps r right_arczTransition.right_arcs   !s4::!'; ::)66 6ZZ^^%F[[^F#DKKN II  fh7 8ZZDJJ! 34F[[__Q'F JJ  f % II  fh7 8rc4|jtjk7ryt|jdkry|jt|jdz }d}|j D]\}}}||k(s d}|r|jj yy)z Note that the algorithm for reduce is only available for arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied r5rrFTN)rcrdrfrrrrj)rrkrmr'rnrNros rreducezTransition.reduces ::)33 3 tzz?a C Oa/0(,   $J9F"   JJNN rct|jdkry|jjd}|jj |y)z Note that the algorithm for shift is the SAME for arc-standard and arc-eager :param configuration: is the current configuration :return: A new configuration or -1 if the pre-condition is not satisfied rr5N)rrrjrrC)rrkrms rshiftzTransition.shifts< t{{ q # &!rN) rUrVrWrXLEFT_ARC RIGHT_ARCr]r^rrqrsrurwr%rrrZrZs6 HI E F 89(. "rrZcPeZdZdZdZdZdZdZdZdZ dZ d Z d Z dd Z d Zy )rdzl Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager" z arc-standardz arc-eagerc||j|jfvr&td|jd|jd||_i|_i|_i|_y)z :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm :type algorithm: str r`rarbN)rerfrg _algorithm _dictionary _transition_match_transition)r algorithms rrzTransitionParser.__init__*s] d//@@$$dnn6 $!#rcl|j|}|j|}|dy|d|dk(r|dSy)Nr*headaddressrel)r)rrnrodepgraphp_nodec_nodes r_get_dep_relationz"TransitionParser._get_dep_relation:sI + * &> ! &>VI. .%= rcg}|D]O}|jj|t|j|j|j|Qdj dt |DS)z :param features: list of feature string which is needed to convert to binary features :type features: list(str) :return : string of binary features in libsvm format which is 'featureID:value' pairs rbc38K|]}t|dzyw)z:1.0N)r ).0 featureIDs r z?TransitionParser._convert_to_binary_features..Rs (1C NV # s)r} setdefaultrrCjoinsorted)rfeaturesunsorted_resultfeatures r_convert_to_binary_featuresz,TransitionParser._convert_to_binary_featuresFsw  >G    ' 'T5E5E1F G  " "4#3#3G#< = > xx 5;O5L   rchg}|jD]6}|j|}d|vs|d}|d}|$|j||f8|D]f\}}||kDr|}|}|}t|dz|D]D}tt|jD]!} | |ks| |kDs|| f|vry| |f|vsyFhy)NrrrFT)rrCrr) rrarc_listkeynodechildIdx parentIdxtempkms r_is_projectivezTransitionParser._is_projectiveVs>> ;C>>#&D~ ? L (OOY$9: ;$, ) Ix)#$ 8a<3 )s8>>23)AH !i-q6X-#(q6X-#( ) ) )rc"|jj|t|jdz||j|j|<t |j|dz|zdz}|j |j dy)z^ write the binary features to input file and update the transition dictionary rrb zutf-8N)r~rrrr writeencode)rrbinary_features input_file input_strs r_write_to_filezTransitionParser._write_to_fileps ##CT-=-=)>)BC8;t//45((-.4FM ))'23rct|j}d}g}|D]#}|j|s|dz }t|}t |j dkDs?|j d}|j } |j| } t |jdkDr=|jt |jdz } |j|| |} | Mtjdz| z} |j| | ||j|| |j| |j| ||} | d}|j}t|dzD]2}||k7s |j|||}||||f|j vs1d}4|rNtj"dz| z} |j| | ||j%|| |j| tj&} |j| | ||j)||j| t |j dkDr&t+dt-t |zt+dt-|z|S)z Create the training example in the libsvm format and write it to the input_file. Reference : Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009) rr:TF Number of training examples : ) Number of valid (projective) examples : )rZrerrrrrTrrrrxrrqrCrrrryrsr]rwprintr )r depgraphsr operation count_proj training_seqrrkb0rrs0rr preconditionmaxIDwrelws r!_create_training_examples_arc_stdz2TransitionParser._create_training_examples_arc_stdzs` t001   !/ )H&&x0 !OJ *Ddkk"Q&[[^002"&"B"B8"Ltzz?Q&C Oa$78B00RBC(11C7#=++C*M!**45$++C0 00RBC'+ $ 1 1!&uqy!1=A Bw'+'='=b!X'N#'#3(*D!}DII'E7< =(","6"6"++C*M!++D#6$++C0 !D"2Y(11!RBN#'D11"aBN#'D ( (//++C*M!((.$++C0 !&&##C*E%##C(Wdkk"Q& 1 )f /#c)n2EEF 9C NJKrc   tjdtjd}|j|jk(r|j ||n|j |||jt|j\}}tjddddd|d }|j||tj|t|d t!|jy #t!jwxYw) z :param depgraphs : list of DependencyGraph as the training data :type depgraphs : DependencyGraph :param modelfile : file name to save the trained model :type modelfile : str ztransition_parse.trainF)prefixdirdeletepolyr3rg?g?T)kerneldegreecoef0gammaCverbose probabilitywbN)tempfileNamedTemporaryFile gettempdirr|rerrcloser namerSVCfitpickledumpopenr)rr modelfilerrx_trainy_trainmodels rtrainzTransitionParser.trains $!44/X5H5H5JSXJ$"3"3366y*M88JO    1*//B GW GG E IIgw ' KKtIt4 5 :?? #F:?? #s CC66D cvg}tjt|d}t|j}|D]}t |}t |jdkDra|j}g} g} g} |D]Q} | |jvs| j|j| | jd| jdStt| } t| }t| }tj||| ffdt |jf}i}|j|d}t!t |D] }||||< t|j#t%dd}|D]\}}|j&|}||j(vr|j(|}|j+dd}|tj,k(r*|j/||j+ddd k7sn|tj0k(r*|j3||j+ddd k7snb|tj4k(r|j7|d k7sn8|tj8k(s|j;|d k7sn t=d t |jdkDrat?|}|j@D]}|j@|}d |d <d|d <|jBD]\}}}|j@|} || d <|| d <!|j||S)aZ :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy :type depgraphs: list(DependencyGraph) :param modelfile: the model file :type modelfile: str :return: list (DependencyGraph) with the 'head' and 'rel' information rbrg?r)shapeT)rreverserr5z;The predicted transition is not recognized, expected errorsr#rr)"rloadrrZr|rrrrTr}rCrrr csr_matrix predict_probaritemsrclasses_rrDrxrqryrsr^rur]rwrgrrr)!rr modelFilerErrrrkrcolrowdatarnp_colnp_rownp_datax_test prob_dict pred_probi sorted_Prob y_pred_idx confidencey_pred strTransitionbaseTransition new_depgraphrrrrchildrs! rparsezTransitionParser.parse!s@ DD12t/ !Y (H *Ddkk"Q&002')G$"2"22 4#3#3G#<= 1  C( ) vc{+s+**vv./3t?O?O;P7Q, !//7: s9~.0A#,Q(>v(F )6)<)@'$Rhrrdcy)a6 >>> from nltk.parse import DependencyGraph, DependencyEvaluator >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition >>> gold_sent = DependencyGraph(""" ... Economic JJ 2 ATT ... news NN 3 SBJ ... has VBD 0 ROOT ... little JJ 5 ATT ... effect NN 3 OBJ ... on IN 5 ATT ... financial JJ 8 ATT ... markets NNS 6 PC ... . . 3 PU ... """) >>> conf = Configuration(gold_sent) ###################### Check the Initial Feature ######################## >>> print(', '.join(conf.extract_features())) STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ ###################### Check The Transition ####################### Check the Initialized Configuration >>> print(conf) Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : [] A. Do some transition checks for ARC-STANDARD >>> operation = Transition('arc-standard') >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.left_arc(conf,"SBJ") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") Middle Configuration and Features Check >>> print(conf) Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)] >>> print(', '.join(conf.extract_features())) STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT >>> operation.right_arc(conf, "PC") >>> operation.right_arc(conf, "ATT") >>> operation.right_arc(conf, "OBJ") >>> operation.shift(conf) >>> operation.right_arc(conf, "PU") >>> operation.right_arc(conf, "ROOT") >>> operation.shift(conf) Terminated Configuration Check >>> print(conf) Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)] B. Do some transition checks for ARC-EAGER >>> conf = Configuration(gold_sent) >>> operation = Transition('arc-eager') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'SBJ') >>> operation.right_arc(conf,'ROOT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'OBJ') >>> operation.right_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'PC') >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.right_arc(conf,'PU') >>> print(conf) Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)] ###################### Check The Training Function ####################### A. Check the ARC-STANDARD training >>> import tempfile >>> import os >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) >>> parser_std = TransitionParser('arc-standard') >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False) Number of training examples : 1 Number of valid (projective) examples : 1 >>> input_file.close() >>> remove(input_file.name) B. Check the ARC-EAGER training >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False) >>> parser_eager = TransitionParser('arc-eager') >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False) Number of training examples : 1 Number of valid (projective) examples : 1 >>> input_file.close() >>> remove(input_file.name) ###################### Check The Parsing Function ######################## A. Check the ARC-STANDARD parser >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True B. Check the ARC-EAGER parser >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True Remove test temporary files >>> remove('temp.arceager.model') >>> remove('temp.arcstd.model') Note that result is very poor because of only one training example. Nr%r%rrdemorsr)rrcopyroperatorrosrnumpyrscipyrsklearnrsklearn.datasetsr ImportError nltk.parser r r rrZrdrr%rrrst 3EDVVrl"l"^gwgT Mq  sA!!A)(A)