L iNddlmZddlmZddlmZddlmZmZm Z ddl m Z m Z m Z mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZGddZGd d eZGd d eZGd deZ GddeZ!GddeZ"GddeZ#GddeZ$GddeZ%GddeZ&ed d!dZ'ed d" d#dZ(y )$) annotations) lru_cache) getLogger)COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated is_arabicis_arabic_isolated_formis_case_variableis_cjk is_emoticon is_hangul is_hiragana is_katakanais_latinis_punctuation is_separator is_symbolis_thaiis_unprintable remove_accent unicode_rangeis_cjk_uncommonc:eZdZdZddZddZd dZed dZy) MessDetectorPluginzy Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. ct)z@ Determine if given character should be fed in. NotImplementedErrorself characters [/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible' "!ct)z The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. rr!s r$feedzMessDetectorPlugin.feed-s "!r'ct)zB Permit to reset the plugin to the initial state. rr"s r$resetzMessDetectorPlugin.reset4r&r'ct)z Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. rr+s r$ratiozMessDetectorPlugin.ratio:s "!r'Nr#strreturnboolr#r0r1Noner1r4r1float) __name__ __module__ __qualname____doc__r%r)r,propertyr.r'r$rr!s* " "" ""r'rc>eZdZddZddZd dZddZed dZy) TooManySymbolOrPunctuationPlugincJd|_d|_d|_d|_d|_y)NrF)_punctuation_count _symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr+s r$__init__z)TooManySymbolOrPunctuationPlugin.__init__Ds*'("#%&04!,1#r'c"|jSN isprintabler!s r$r%z)TooManySymbolOrPunctuationPlugin.eligibleL$$&&r'c8|xjdz c_||jk7ro|tvrgt|r|xjdz c_||_y|j dur-t |r"t|dur|xjdz c_||_y)NrF) rCrDrrrAisdigitrrrBr!s r$r)z%TooManySymbolOrPunctuationPlugin.feedOs " 22 2!==i(''1,'%.! !!#u,i( *e3""a'"$-!r'c.d|_d|_d|_yNr)rArCrBr+s r$r,z&TooManySymbolOrPunctuationPlugin.resetas"# !r'c|jdk(ry|j|jz|jz }|dk\r|SdS)Nr333333?)rCrArB)r"ratio_of_punctuations r$r.z&TooManySymbolOrPunctuationPlugin.ratiofsO  A %  # #d&8&8 8  ! !'"(eZdZddZddZd dZddZed dZy) TooManyAccentuatedPluginc d|_d|_yrPrC_accentuated_countr+s r$rFz!TooManyAccentuatedPlugin.__init__ss%&'(r'c"|jSrH)isalphar!s r$r%z!TooManyAccentuatedPlugin.eligiblews  ""r'cp|xjdz c_t|r|xjdz c_yyNr)rCr rZr!s r$r)zTooManyAccentuatedPlugin.feedzs1 " ) $  # #q ( # %r'c d|_d|_yrPrYr+s r$r,zTooManyAccentuatedPlugin.resets !"#r'cf|jdkry|j|jz }|dk\r|SdS)NrRgffffff?rY)r"ratio_of_accentuations r$r.zTooManyAccentuatedPlugin.ratios=  1 $'+'>'>AVAV'V(=(E$N3Nr'Nr5r/r3r6rUr=r'r$rWrWrs,)#) $OOr'rWc>eZdZddZddZd dZddZed dZy) UnprintablePluginc d|_d|_yrP)_unprintable_countrCr+s r$rFzUnprintablePlugin.__init__s'(%&r'cyNTr=r!s r$r%zUnprintablePlugin.eligibler'cnt|r|xjdz c_|xjdz c_yr^)rrfrCr!s r$r)zUnprintablePlugin.feeds, ) $  # #q ( # "r'cd|_yrP)rfr+s r$r,zUnprintablePlugin.resets "#r'cZ|jdk(ry|jdz|jz S)NrrRra)rCrfr+s r$r.zUnprintablePlugin.ratios/  A %''!+t/D/DDDr'Nr5r/r3r6rUr=r'r$rdrds,'# $EEr'rdc>eZdZddZddZd dZddZed dZy) SuspiciousDuplicateAccentPluginc.d|_d|_d|_yrP_successive_countrC_last_latin_characterr+s r$rFz(SuspiciousDuplicateAccentPlugin.__init__s&'%&15"r'c<|jxr t|SrH)r\rr!s r$r%z(SuspiciousDuplicateAccentPlugin.eligibles  ":x '::r'c~|xjdz c_|jt|rt|jru|jr/|jjr|xjdz c_t |t |jk(r|xjdz c_||_yr^)rCrrr isupperrqrr!s r$r)z$SuspiciousDuplicateAccentPlugin.feeds "  & & 2y)t99:  "t'A'A'I'I'K&&!+&Y'=9S9S+TT&&!+&%."r'c.d|_d|_d|_yrPrpr+s r$r,z%SuspiciousDuplicateAccentPlugin.resets!" !%)"r'cZ|jdk(ry|jdz|jz S)NrrRrM)rCrqr+s r$r.z%SuspiciousDuplicateAccentPlugin.ratios/  A %&&*d.C.CCCr'Nr5r/r3r6rUr=r'r$rnrns,6 ; /* DDr'rnc>eZdZddZddZd dZddZed dZy) SuspiciousRangec.d|_d|_d|_yrP)"_suspicious_successive_range_countrC_last_printable_seenr+s r$rFzSuspiciousRange.__init__s78/%&04!r'c"|jSrHrIr!s r$r%zSuspiciousRange.eligiblerKr'c<|xjdz c_|jst|s|tvrd|_y|j||_yt |j}t |}t ||r|xjdz c_||_yr^)rCisspacerrr|r is_suspiciously_successive_ranger{)r"r#unicode_range_aunicode_range_bs r$r)zSuspiciousRange.feeds "    i(88(,D %   $ $ ,(1D % &3D4M4M&N&3I&> +O_ M  3 3q 8 3$-!r'c.d|_d|_d|_yrP)rCr{r|r+s r$r,zSuspiciousRange.resets !23/$(!r'c^|jdkry|jdz|jz }|S)N rRrM)rCr{)r"ratio_of_suspicious_range_usages r$r.zSuspiciousRange.ratios<  B &  3 3a 7  ! !2"'/.r'Nr5r/r3r6rUr=r'r$ryrys*5 '..) //r'ryc>eZdZddZddZd dZddZed dZy) SuperWeirdWordPlugincd|_d|_d|_d|_d|_d|_d|_d|_d|_d|_ y)NrF) _word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrC_bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr+s r$rFzSuperWeirdWordPlugin.__init__sQ !$%() */!). %&)*! )*!() r'cyrhr=r!s r$r%zSuperWeirdWordPlugin.eligible rir'c|jr|xj|z c_t|r|xjdz c_|jdur`t |dus t|rHt |dur;t|dur.t|dur!t|durt|durd|_t |s,t|s!t|st|s t|r|xjdz c_ y|jsy|jst|s t|r|jr|xjdz c_t!|j}|xj"|z c_|dk\r|j|z dk\rd|_nt|jdrX|jdj'r;t)d|jDdur|xj*dz c_d|_n+|jdk(rd|_|xj*dz c_|dk\r|jrwt-|jt/d |Dcgc]\}}|j'r|}}}d}|rt!||z d krd}|s|xj*dz c_d|_|j$rD|xj0dz c_|xj2t!|jz c_d|_d|_d |_d |_d |_ y|d vr<|j5dur)t7|rd|_|xj|z c_yyyycc}}w) NrFT?c3<K|]}|jywrH)ru).0_s r$ z,SuperWeirdWordPlugin.feed..8s>AAIIK>srrSr>r-<=>|~)r\rr rrrrrrrrrrrrrlenrCrruallrziprangerrrNr)r"r# buffer_lengthcicamel_case_dstprobable_camel_caseds r$r)zSuperWeirdWordPlugin.feeds      LLI %Li())Q.)((E1i(E1^I5N9%.i(E1 *e3 *e3I&%/+/(y!Y'y)y)9%((A-( ||     >)#< Y@Wll    ! !$T\\!2M  ! !] 2 !!,,}<C04D-#4<<#34 R(002>>>%G,,1,04D---204D-,,1,"t'?'?!$DLL%=2I J"1yy{"" .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D $DL()D %'(D $ @ @!!#u,)$(,D % LLI %L%- A1"s/M1ctd|_d|_d|_d|_d|_d|_d|_d|_y)NrFr)rrrrrrCrrr+s r$r,zSuperWeirdWordPlugin.reset_sA $)!#(   !$%!#$ r'cr|jdkr|jdk(ry|j|jz S)N rrR)rrrrCr+s r$r.zSuperWeirdWordPlugin.ratiois7   r !d&>&>!&C((4+@+@@@r'Nr5r/r3r6rUr=r'r$rrs. *O&b%AAr'rcBeZdZdZddZd dZd dZddZed dZ y) CjkUncommonPluginz< Detect messy CJK text that probably means nothing. c d|_d|_yrPrC_uncommon_countr+s r$rFzCjkUncommonPlugin.__init__vs%&$%r'ct|SrH)rr!s r$r%zCjkUncommonPlugin.eligiblezs i  r'cp|xjdz c_t|r|xjdz c_yyr^)rCrrr!s r$r)zCjkUncommonPlugin.feed}s4 " 9 %  A %  &r'c d|_d|_yrPrr+s r$r,zCjkUncommonPlugin.resets ! r'cl|jdkry|j|jz }|dkDr|dz SdS)NrarRrrr)r"uncommon_form_usages r$r.zCjkUncommonPlugin.ratiosD  1 $%)%9%9DeZdZddZddZd dZddZed dZy) ArchaicUpperLowerPlugincfd|_d|_d|_d|_d|_d|_d|_y)NFrT)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrC_last_alpha_seen_current_ascii_onlyr+s r$rFz ArchaicUpperLowerPlugin.__init__s9 45,23*890%&,0)- r'cyrhr=r!s r$r%z ArchaicUpperLowerPlugin.eligiblerir'cZ|jxr t|}|du}|r|jdkDr|jdkr?|jdur-|jdur|xj |j z c_d|_d|_d|_d|_|xjdz c_ d|_y|jdur|jdurd|_|j|jr|jjs*|jrM|jjr3|jdur|xj dz c_d|_nd|_nd|_|xjdz c_ |xjdz c_||_y)NFr@rTrM) r\r rrNrrrrrrCisasciiruislower)r"r# is_concerned chunk_seps r$r)zArchaicUpperLowerPlugin.feeds ((*J/? /J  E) ==A44:%%'50,,58866823D .34D 0$(D !DI  ! !Q & !'+D $   # #t + 0A0A0Cu0L',D $  ,!!#(=(=(E(E(G!!#(=(=(E(E(G99$66!;6 %DI $DI!  " ,,1, )r'cfd|_d|_d|_d|_d|_d|_d|_y)NrFT)rCrrrrrrr+s r$r,zArchaicUpperLowerPlugin.resets9 !/0,-.*340 $ #' r'cT|jdk(ry|j|jz S)NrrR)rCrr+s r$r.zArchaicUpperLowerPlugin.ratios*  A %77$:O:OOOr'Nr5r/r3r6rUr=r'r$rrs- .(*T(PPr'rc>eZdZddZddZddZd dZed dZy) ArabicIsolatedFormPluginc d|_d|_yrPrC_isolated_form_countr+s r$rFz!ArabicIsolatedFormPlugin.__init__s%&)*!r'c d|_d|_yrPrr+s r$r,zArabicIsolatedFormPlugin.resets !$%!r'ct|SrH)r r!s r$r%z!ArabicIsolatedFormPlugin.eligibles ##r'cp|xjdz c_t|r|xjdz c_yyr^)rCr rr!s r$r)zArabicIsolatedFormPlugin.feeds1 " "9 -  % % * % .r'cX|jdkry|j|jz }|S)NrarRr)r"isolated_form_usages r$r.zArabicIsolatedFormPlugin.ratios0  1 $%)%>%>AVAV%V""r'Nr5r/r3r6) r8r9r:rFr,r%r)r<r.r=r'r$rrs*+&$+ ##r'r)maxsizec||y||k(ryd|vrd|vryd|vsd|vryd|vsd|vr d|vsd|vry|jd|jd}}|D]}|tvr ||vsy|dv|dv}}|s|r d|vsd|vry|r|ryd |vsd |vrd|vsd|vry|d k(s|d k(ryd|vs d|vs|d vr!|d vrd |vsd |vryd |vsd |vry|d k(s|d k(ryy)za Determine if two Unicode range seen next to each other can be considered as suspicious. TFLatin Emoticons Combining )HiraganaKatakanaCJKHangulz Basic Latin)rr PunctuationForms)splitr )rrkeywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss r$rrs/"9/)/!g&@o%)G ?"g&@&+*H c"c"'  0 0  ! !    33 ' ,  E_$<,?"h/&A O #u'? m +-/O  E_$<33 7 7 O +}/O o %O)C m +-/O r'ic tjDcgc] }| }}t|dz}d}|dkrd}n |dkrd}nd}t|dzt |D]^\}} |D]%} | j |s| j |'| d kDr| |zd k(s | |dz k(sFtd |D}||k\s^n|rtd } | jtd |d |d|t|dkDr8| jtd|dd| jtd|dd|D]1} | jt| jd| j3t|dScc}w)zw Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. rrRi rr rc34K|]}|jywrH)r.)rdts r$rzmess_ratio..es!?r"((!?scharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=zStarting with: Nz Ending with: iz: )r__subclasses__rrrr%r)sumrlogr __class__r.round) decoded_sequencemaximum_thresholddebugmd_class detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr#indexdetectorloggerrs r$ mess_ratiorFs$6#D#D#F+ +I+&'!+F O |13) 4,.),/) 04 7vG  5! )H  + i( ) AI%"CCqH fqj !!?Y!??O"33  /0  11R0SSdetduv!!2 3 5   2 % JJu0@"0E/FG H JJu .>su.E-FG H =B JJub ; < = ! $$[+sE6N)r str | Nonerrr1r2)g?F)rr0rr7rr2r1r7)) __future__r functoolsrloggingrconstantrrr utilsr r r r rrrrrrrrrrrrrrrr?rWrdrnryrrrrrrr=r'r$r s<" ,""D,L'9,L^O1O6E*E0"D&8"DJ./(./bsA-sAl N* NFIP0IPX#1#8 4FF2<F FFR 4IN4%4%.34%BF4% 4%4%r'