JL i dZ ddlZedZdZdZdZdZ gdZ gd Z gd Z id d d ddddddddddddddddddd d!d d"d#d$d d%dd&dd'dd(dd)d*d+dd dd+d dd+d d+d,Z dd-d.dd/d0d0ddd1d2d2d3 Zid4d d$d5d5d5d5d5d5d6d7d d$d5d8d5d5d5d5d6d9dd$d5d5d5d5d5d5d6d:dd$d5d8d5d5d5d5d6d;dd$d5d5d5d8d5d5d6ddd$d5d8d5d5d5d5d6d?dd$d5d5d5d5d5d5d6d@dd$d5d8d5d5d5d5d6dAdd$d5d5d5d5d5d5d6dBdd$d5d8d5d5d5d5d6dCdd$d5d5d5d5d5d5d6dDd d$d5d8d8d5d5d5d6dEd d$d5d8d8d5d5d5d6dFdd$d5d8d8d5d5d5d6dGdd$d5d8d8d8d5d5d6idHdd$d5d8d8d5d5d5d6dIdd$d5d8d8d5d5d5d6dJdd$d5d8d8d5d5d5d6dKdd$d5d8d8d5d5d5d6dLd d'd5d8d5d5d5d5d6dMd d'd5d8d5d5d5d5d6dNdd'd5d8d5d8d5d5d6dOdd'd5d8d5d5d5d5d6dPdd'd5d8d5d5d5d5d6dQddRd5d8d5d5d5d5d6dSddRd5d8d5d8d5d5d6dTd d&d5d5d5d5d5d5d6dUd d&d5d8d5d5d5d5d6dVd d&d5d5d5d5d5d5d6dWd d&d5d8d5d5d5d5d6dXdd&d5d5d5d5d5d5d6dYdd&d5d8d5d5d5d5d6idZdd&d5d5d5d5d5d5d6d[dd&d5d8d5d5d5d5d6d\dd&d5d5d5d5d5d5d6d]dd&d5d8d5d5d5d5d6d^dd&d5d5d5d8d5d5d6d_dd&d5d8d5d8d5d5d6d`dd&d5d5d5d5d5d5d6dadd&d5d8d5d5d5d5d6dbdd&d5d5d5d5d5d5d6dcdd&d5d8d5d5d5d5d6dddd&d5d5d5d5d5d5d6dedd&d5d8d5d5d5d5d6dfdd&d5d5d5d5d5d5d6dgdd&d5d8d5d5d5d5d6dhdd&d5d5d5d5d5d5d6didd&d5d8d5d5d5d5d6djdd&d5d5d5d5d8d5d6idkdd&d5d8d5d5d8d5d6dld dmd5d8d5d5d5d5d6dnddmd5d8d5d5d5d5d6doddmd5d8d5d8d5d5d6dpddmd5d8d5d5d5d5d6dqddmd5d8d5d5d5d5d6drddmd5d8d5d5d8d5d6dsd!dmd5d8d5d5d5d5d6dtd"dud8d8d5d5d5dvdwd5d5d5dx dyd"dud8d8d5d5d5dvdwd8d5d5dx dzd"dud8d8d5d5d5d{dwd5d5d5dx d|d"dud8d8d5d5d5d{dwd5d8d5dx d}d"dud8d8d5d5d5d{dwd8d5d5dx d~d"dud8d8d5d5d5d{dwd8d5d5dx dd"dud8d8d5d5d5d{dwd5d5d5dx dd"dud8d8d5d5d5d{dwd8d5d5dx dd"dud8d8d5d5d5ddwd5d5d5dx idd"dud8d8d5d5d5ddwd5d5d5dx dd"dud8d8d5d5d5ddd5d5d5dx dd"dud8d8d5d5d5ddd5d5d5dx dd"dud8d8d5d5d5ddwd8d5d5dx dd"dud8d8d5d5d5ddwd5d8d5dx dd"dud8d8d5d5d5dvdd5d5d5dx dd"dud8d8d5d5d5dvdd8d5d5dx dd"dud8d8d5d5d5d{dd5d5d5dx dd"dud8d8d5d5d5d{dd5d5d5dx dd"dud8d8d5d5d5d{dd8d5d5dx dd"dud8d8d5d5d5dvdd8d5d5dx dd"dud8d8d5d5d5dvdd8d8d5dx dd"dud8d8d5d5d5d{dd8d5d5dx dd"dud8d8d5d5d5d{dd8d5d5dx dd"dud8d8d5d5d5d{dd8d8d5dx dd"dud8d8d5d5d5d{dd8d5d5dx dd"dud8d8d5d5d5d{dd5d5d5dx d"dud8d8d5d5d5ddd8d5d5dx d"dud8d8d5d5d5ddd5d5d5dx d"dud8d8d5d5d5dvdwd5d8d5dx d"dud8d8d5d5d5dvdd8d5d5dx d"dud8d8d5d5d5dvdwd8d5d5dx d"dud8d8d5d5d5dvdd8d5d5dx d"dud8d8d5d5d5dvdd5d5d5dx d"dud8d8d5d5d5d{dwd5d5d5dx d"dud8d8d5d5d5dvdd8d5d5dx d"dud8d8d5d5d5d{dd5d5d5dx d"dud8d8d5d5d5d{dd5d5d5dx d ZddZdZdZdZdZdZdZdZdZdZdZedk(reyy#e$rdZYwxYw)u ALINE https://webdocs.cs.ualberta.ca/~kondrak/ Copyright 2002 by Grzegorz Kondrak. ALINE is an algorithm for aligning phonetic sequences, described in [1]. This module is a port of Kondrak's (2002) ALINE. It provides functions for phonetic sequence alignment and similarity analysis. These are useful in historical linguistics, sociolinguistics and synchronic phonology. ALINE has parameters that can be tuned for desired output. These parameters are: - C_skip, C_sub, C_exp, C_vwl - Salience weights - Segmental features In this implementation, some parameters have been changed from their default values as described in [1], in order to replicate published results. All changes are noted in comments. Example usage ------------- # Get optimal alignment of two phonetic sequences >>> align('θin', 'tenwis') # doctest: +SKIP [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]] [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, University of Toronto. Ninfi#-)<BNRbcdfghjklmnpqrstvxzçðħŋɖɟɢɣɦɬɮɰɱɲɳɴɸɹɻɽɾʀʁʂʃʈʋuʐ ʒʔʕʙʝβθχʐw) aspiratedlateralmannernasalplace retroflexsyllabicvoice) backrClongrDrErFrGroundrHrIbilabial? labiodentalgffffff?dentalg?alveolarg333333?rGg?zpalato-alveolarg?palatalgffffff?velarg333333?uvularg? pharyngealg333333?glottalg? labiovelarvowelgstop affricate fricativetrillg?g?g?)tap approximantz high vowelz mid vowelz low vowelvowel2highmidlowfrontcentralrJplusminus(2 ) rHrFrDrIrErGrCrBrKrarJrLrrg)rFrDrHrIrErGrCrBr rfrr r6r!r r"rrrr#r9rr)rr+r*r r,rr;rrr2r r1r^r0r-r=r rr>rrrr5r8r4r@rr<rr$r?r3rr:rr%r&r'r7r_r.r/rr(rrAir`rard) rFrDrHrIrErGrCrarJrLrKrByerbEøuø̞uɛuœærcaäreuɐuɶAuɨuʉuəuɜuɞurJUouo̞Ouɔuʌ) uɒuɑIuɯuʏuʊuɘue̞uɵuɤuɤ̞ct tdd|cxkr dksJdJdt|}t|}tj|dz|dzft}t d|dzD]}t d|dzD]}||dz |ft ||dz z}|||dz ft ||dz z} ||dz |dz ft||dz ||dz z} |dkDr'||dz |dz ft||dz ||dz |z} nt } |dkDr'||dz |dz ft||dz ||dz |z} nt } t|| | | | d|||f<d|z tj|z} g}t d|dzD]B}t d|dzD].}|||f| k\s|jt||d|| ||g0D|S) a Compute the alignment of two phonetic strings. :param str str1: First string to be aligned :param str str2: Second string to be aligned :type epsilon: float (0.0 to 1.0) :param epsilon: Adjusts threshold similarity score for near-optimal alignments :rtype: list(list(tuple(str, str))) :return: Alignment(s) of str1 and str2 (Kondrak 2002: 51) z1You need numpy in order to use the align functionr]rNz$Epsilon must be between 0.0 and 1.0.)dtypermr)np ImportErrorlenzerosfloatrange sigma_skip sigma_sub sigma_exprmaxamaxappend _retrieve)str1str2epsilonrrSrnredit1edit2edit3edit4edit5T alignmentss X/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/nltk/metrics/aline.pyalignr$s@ zMNN ' S H"HH H"HH  D A D A !a%Qu-A 1a!e_ @q!a% @Aa!eQhK*T!a%["99EaQhK*T!a%["99Ea!eQUlOiQU T!a%[&IIE1u!a%Q,)DQKa!ea*QQ1u!a%Q,)DQKa!ea*QQ%ueQ?AadG @ @ W "AJ 1a!e_Lq!a% LAAw!|!!)Aq!Q4r"JK LL c |||fdk(r|S|dkDr||dz |dz ft||dz ||dz |z|z|k\rS|jd||dz ||dz |ft|dz |dz |t||dz ||dz |z||||||S|dkDr||dz |dz ft||dz ||dz |z|z|k\rS|jd||dz |||dz ft|dz |dz |t||dz ||dz |z||||||S|||dz ft||dz z|z|k\rC|jdd||dz ft||dz |t||dz z||||||S||dz |ft||dz z|z|k\rC|jd||dz dft|dz ||t||dz z||||||S||dz |dz ft ||dz ||dz z|z|k\rQ|jd||dz ||dz ft|dz |dz |t ||dz ||dz z||||||S)z Retrieve the path through the similarity matrix S starting at (i, j). :rtype: list(tuple(str, str)) :return: Alignment of str1 and str2 rr}rm-)rinsertrrr)rnrrrrrrouts rrrXs$ Aw!| q5Qq1ua!e|_ya!ed1q51o'NNQRRVWW JJq4A;QUQ8 9 AAId1q5k4A?;; T J? EaAq1u o $q1u+tAEA(OORSSWXX JJq4A?DQK8 9 AAId1q5k4A?;; 8 J%q!a%x[:d1q5k2 2Q 6! ; JJq3QU , - aQJtAE{$; ;Q4s S Jq1uax[:d1q5k2 2Q 6! ; JJq4A;, - a!eQJtAE{$; ;Q4s S Jq1ua!e|_ya!ed1q5kB BQ F! K JJq4A;QU 4 5 AAId1q5k4A;77  JrctS)zA Returns score of an indel of P. (Kondrak 2002: 54) )C_skiprs rrrs MrcXtt||z t|z t|z S)zN Returns score of a substitution of P with Q. (Kondrak 2002: 54) )C_subdeltaVrrs rrrs' 5A; 1 %! ,,rc|d}|d}tt||z t||z t|z tt|t|z S)zL Returns score of an expansion/compression. (Kondrak 2002: 54) rr})C_exprrr)rrq1q2s rrrsO 1B 1B 5B< %2, .1 5AbE1R58I IIrc  t||}d}tDtj|Dcgc]}t|||c}|Dcgc] }t| c}S|D]}|t|||t|zz }|Scc}wcc}w)zT Return weighted sum of difference between P and Q. (Kondrak 2002: 54) r)r rdotdiffsalience)rrfeaturestotalr s rrrs AwH E ~vv$, -qT!Q] -X/N /N  - aA!,,- L ./Ns A<B cjt|t|}}tt||t||z S)zi Returns difference between phonetic segments P and Q for feature F. (Kondrak 2002: 52, 54) )feature_matrixabssimilarity_matrix)rrr p_features q_featuress rrrs; ,A.q0A J  A/2CJqM2RR SSrc:|tvs|tvrtStS)zR Return relevant features for segment comparison. (Kondrak 2002: 54) ) consonantsR_cR_vrs rr r s  J!z/ Jrc |tvrytS)zD Return vowel weight if P is vowel. (Kondrak 2002: 54) r)rC_vwlrs rrrs  J Lrc >tjdDcgc]}|jd}}|D]]}t|d|dd}|Dcgc]}d|dd|dd}}dj|}t |dd |dd |_y cc}wcc}w) zq A demonstration of the result of aligning phonetic sequences used in Kondrak's (2002) dissertation.  ,rr}(z, ) z ~ z : N) cognate_datasplitrjoinprint)pairdata alignmentrts rdemors )5(:(:4(@ ADJJsO AD A6$q'47+A. 2;rs>  El    = @  %%4% c %  %  %t%s% S% c%#%s%#% T%  C!%"#%$%%& S'%(        I%R      W W W, -W@ AWT  UWh  iW| }WP  QWd eWx yWL MW`  aWt  uWH IW\  ]Wp qWD  EWX  YWl  mW@  AWT UWh  iW| }WP QWd  eWx yWL  MW`  aWt  uWH   I W\  ] Wp  q WD   E WX   Y Wl  m W@  A WT  " U Wh  " i W|   } WP   Q Wd   e Wx   y WL  M W`   a Wt   u WH  IW\  ]Wp  qWD EWX  YWl  mW@  AWT  UWh  iW|  }WP QWd  eWx yWL MWb cW~ WZ [Wv wWR  SWn  oWJ  KWf  gWB  CW^ _Wz  {WV  WWr  sWN OWj  kWF  GWb  cW~  WZ  [Wv wWR SWn oWJ  KWf gWB  CW^  _W|           S"Wx"1h6r-J"T 6J Z zFe0 BsV33V>=V>