3
eG                 @   s<  d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZ ee! ee! e"dddZ#eddd#e!e$e"e$dd d!Z%d"S )$    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thairemove_accentunicode_rangec               @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    )	characterreturnc             C   s   t dS )z@
        Determine if given character should be fed in.
        N)NotImplementedError)selfr    r   Y/var/www/html/StaffProfile/staffvenv/lib/python3.6/site-packages/charset_normalizer/md.pyeligible   s    zMessDetectorPlugin.eligibleNc             C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        N)r   )r   r   r   r   r   feed$   s    zMessDetectorPlugin.feed)r   c             C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        N)r   )r   r   r   r   reset+   s    zMessDetectorPlugin.resetc             C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        N)r   )r   r   r   r   ratio1   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr   r   r    propertyfloatr!   r   r   r   r   r      s   r   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS ) TooManySymbolOrPunctuationPluginN)r   c             C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_word)r   r   r   r   __init__;   s
    z)TooManySymbolOrPunctuationPlugin.__init__)r   r   c             C   s   |j  S )N)isprintable)r   r   r   r   r   r   C   s    z)TooManySymbolOrPunctuationPlugin.eligiblec             C   sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.|j dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r-   r.   r   r   r+   isdigitr   r   r,   )r   r   r   r   r   r   F   s    
z%TooManySymbolOrPunctuationPlugin.feedc             C   s   d| _ d| _d| _d S )Nr   )r+   r-   r,   )r   r   r   r   r    X   s    z&TooManySymbolOrPunctuationPlugin.resetc             C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr   g        g333333?)r-   r+   r,   )r   Zratio_of_punctuationr   r   r   r!   ]   s
    

z&TooManySymbolOrPunctuationPlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r*   :   s   r*   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )TooManyAccentuatedPluginN)r   c             C   s   d| _ d| _d S )Nr   )r-   _accentuated_count)r   r   r   r   r/   j   s    z!TooManyAccentuatedPlugin.__init__)r   r   c             C   s   |j  S )N)isalpha)r   r   r   r   r   r   n   s    z!TooManyAccentuatedPlugin.eligiblec             C   s(   |  j d7  _ t|r$|  jd7  _d S )Nr   )r-   r   r4   )r   r   r   r   r   r   q   s    zTooManyAccentuatedPlugin.feedc             C   s   d| _ d| _d S )Nr   )r-   r4   )r   r   r   r   r    w   s    zTooManyAccentuatedPlugin.resetc             C   s*   | j dkrdS | j| j  }|dkr&|S dS )Nr   g        gffffff?)r-   r4   )r   Zratio_of_accentuationr   r   r   r!   {   s    
zTooManyAccentuatedPlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r3   i   s   r3   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )UnprintablePluginN)r   c             C   s   d| _ d| _d S )Nr   )_unprintable_countr-   )r   r   r   r   r/      s    zUnprintablePlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r      s    zUnprintablePlugin.eligiblec             C   s@   |j  dkr.|j dkr.|dkr.|  jd7  _|  jd7  _d S )NFr   )isspacer0   r7   r-   )r   r   r   r   r   r      s
    zUnprintablePlugin.feedc             C   s
   d| _ d S )Nr   )r7   )r   r   r   r   r       s    zUnprintablePlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g           )r-   r7   )r   r   r   r   r!      s    
zUnprintablePlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r6      s   	r6   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousDuplicateAccentPluginN)r   c             C   s   d| _ d| _d | _d S )Nr   )_successive_countr-   _last_latin_character)r   r   r   r   r/      s    z(SuspiciousDuplicateAccentPlugin.__init__)r   r   c             C   s   |j  ot|S )N)r5   r   )r   r   r   r   r   r      s    z(SuspiciousDuplicateAccentPlugin.eligiblec             C   st   |  j d7  _ | jd k	rjt|rjt| jrj|j rJ| jj rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S )Nr   )r-   r=   r   isupperr<   r   )r   r   r   r   r   r      s    

z$SuspiciousDuplicateAccentPlugin.feedc             C   s   d| _ d| _d | _d S )Nr   )r<   r-   r=   )r   r   r   r   r       s    z%SuspiciousDuplicateAccentPlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g        r1   )r-   r<   )r   r   r   r   r!      s    
z%SuspiciousDuplicateAccentPlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r;      s   r;   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousRangeN)r   c             C   s   d| _ d| _d | _d S )Nr   )"_suspicious_successive_range_countr-   _last_printable_seen)r   r   r   r   r/      s    zSuspiciousRange.__init__)r   r   c             C   s   |j  S )N)r0   )r   r   r   r   r   r      s    zSuspiciousRange.eligiblec             C   sx   |  j d7  _ |j s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S )Nr   )r-   r9   r   r   rA   r    is_suspiciously_successive_ranger@   )r   r   unicode_range_aunicode_range_br   r   r   r      s    

zSuspiciousRange.feedc             C   s   d| _ d| _d | _d S )Nr   )r-   r@   rA   )r   r   r   r   r       s    zSuspiciousRange.resetc             C   s.   | j dkrdS | jd | j  }|dk r*dS |S )Nr   g        r1   g?)r-   r@   )r   Zratio_of_suspicious_range_usager   r   r   r!      s    
zSuspiciousRange.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r?      s   r?   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuperWeirdWordPluginN)r   c             C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr-   _bad_character_count_buffer_buffer_accent_count)r   r   r   r   r/      s    zSuperWeirdWordPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r     s    zSuperWeirdWordPlugin.eligiblec             C   s  |j  rdj| j|g| _t|r0|  jd7  _| jdkrt|dksNt|rt|dkrt|dkrt	|dkrt
|dkrt|dkrd| _d S | jsd S |j st|st|o| jr|  jd7  _t| j}|  j|7  _|dkr8| j| dkrd| _t| jd r8| jd j r8|  jd7  _d| _|dkr^| jr^|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _n6|dkr|j dkrt|rd| _|  j|7  _d S )NrF   r   FT   g(\?   r   <>-=~|_rX   >   rS   rW   rU   rR   rT   rQ   rV   )r5   joinrM   r   rN   rK   r   r   r   r   r   r   r9   r   r   rG   lenr-   rJ   r>   rI   rH   rL   r2   r   )r   r   Zbuffer_lengthr   r   r   r     sR    


 

zSuperWeirdWordPlugin.feedc             C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrF   Fr   )rM   rJ   rK   rH   rG   r-   rL   rI   )r   r   r   r   r    D  s    zSuperWeirdWordPlugin.resetc             C   s$   | j dkr| jdkrdS | j| j S )N
   r   g        )rG   rI   rL   r-   )r   r   r   r   r!   N  s    zSuperWeirdWordPlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   rE      s   6
rE   c               @   s^   e Zd ZdZddddZeedddZeddd	d
ZddddZ	e
edddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    N)r   c             C   s   d| _ d| _d S )Nr   )_wrong_stop_count_cjk_character_count)r   r   r   r   r/   \  s    zCjkInvalidStopPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r   `  s    zCjkInvalidStopPlugin.eligiblec             C   s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N   丅   丄r   >   r_   r`   )r]   r   r^   )r   r   r   r   r   r   c  s
    zCjkInvalidStopPlugin.feedc             C   s   d| _ d| _d S )Nr   )r]   r^   )r   r   r   r   r    j  s    zCjkInvalidStopPlugin.resetc             C   s   | j dk rdS | j| j  S )N   g        )r^   r]   )r   r   r   r   r!   n  s    
zCjkInvalidStopPlugin.ratio)r"   r#   r$   r%   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   r\   V  s   r\   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )ArchaicUpperLowerPluginN)r   c             C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr-   _last_alpha_seen_current_ascii_only)r   r   r   r   r/   v  s    z ArchaicUpperLowerPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r     s    z ArchaicUpperLowerPlugin.eligiblec             C   s$  |j  ot|}|dk}|r| jdkr| jdkrV|j dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkrt
|dkrd| _| jd k	r|j r| jj s|j r| jj r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr1   )r5   r
   rd   r2   rh   rf   re   rg   rc   r-   r	   r>   islower)r   r   Zis_concernedZ	chunk_sepr   r   r   r     s8    




zArchaicUpperLowerPlugin.feedc             C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r-   rd   re   rf   rg   rc   rh   )r   r   r   r   r      s    zArchaicUpperLowerPlugin.resetc             C   s   | j dkrdS | j| j  S )Nr   g        )r-   rf   )r   r   r   r   r!     s    
zArchaicUpperLowerPlugin.ratio)r"   r#   r$   r/   r&   r'   r   r   r    r(   r)   r!   r   r   r   r   rb   u  s   *	rb   )rC   rD   r   c             C   s~  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS | j d|j d }}x"|D ]}|tkrq||krdS qW | dk|dk }}|s|rd
| ksd
|krdS |r|rdS d| ksd|kr"d
| ksd
|kr
dS | dks|dkr"dS d
| ksJd
|ksJ| dkrz|dkrzd| ks^d|krbdS d| ksvd|krzdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining HiraganaKatakanaCJKZHangulzBasic LatinZPunctuationZForms)rl   rm   )rl   rm   )rm   rl   )rm   rl   )splitr   )rC   rD   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r   rB     sR    
 

rB   i   )maxsize皙?F)decoded_sequencemaximum_thresholddebugr   c             C   s   dd t j D }t| d }d}|dk r0d}n|dkr>d}nd	}x|t| d
 t|D ]f\}}x |D ]}	|	j|rd|	j| qdW |dkr|| dks||d krVtdd |D }||krVP qVW |rx|D ]}
t|
j	|
j
 qW t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c             S   s   g | ]
}| qS r   r   ).0Zmd_classr   r   r   
<listcomp>  s    zmess_ratio.<locals>.<listcomp>r   g        i       i   ri      
r   c             s   s   | ]}|j V  qd S )N)r!   )rv   dtr   r   r   	<genexpr>&  s    zmess_ratio.<locals>.<genexpr>   )r   __subclasses__rZ   zipranger   r   sumprint	__class__r!   round)rs   rt   ru   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorr{   r   r   r   
mess_ratio  s*    	


r   N)rr   F)&	functoolsr   typingr   r   Zconstantr   r   utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r3   r6   r;   r?   rE   r\   rb   r&   r'   rB   r)   r   r   r   r   r   <module>   s"   D"/%6ZMD