U
    kufL                     @   sn  d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&eddee' ee' e(dddZ)ed dd'e'e*e(e*d#d$d%Z+d&S )(    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C   s   t dS )z@
        Determine if given character should be fed in.
        NNotImplementedErrorselfr    r#   9/tmp/pip-unpacked-wheel-fc9lr3jv/charset_normalizer/md.pyeligible%   s    zMessDetectorPlugin.eligibleNc                 C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        Nr   r!   r#   r#   r$   feed+   s    zMessDetectorPlugin.feedr   c                 C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        Nr   r"   r#   r#   r$   reset2   s    zMessDetectorPlugin.resetc                 C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        Nr   r(   r#   r#   r$   ratio8   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr%   r&   r)   propertyfloatr*   r#   r#   r#   r$   r      s   r   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS ) TooManySymbolOrPunctuationPluginNr'   c                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr(   r#   r#   r$   __init__B   s
    z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C   s   |  S Nisprintabler!   r#   r#   r$   r%   J   s    z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.| dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r6   r7   r   r   r4   isdigitr   r   r5   r!   r#   r#   r$   r&   M   s    

z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r4   r6   r5   r(   r#   r#   r$   r)   _   s    z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr           333333?)r6   r4   r5   )r"   Zratio_of_punctuationr#   r#   r$   r*   d   s    

z&TooManySymbolOrPunctuationPlugin.ratior+   r,   r-   r8   r/   r0   r%   r&   r)   r1   r2   r*   r#   r#   r#   r$   r3   A   s   r3   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )TooManyAccentuatedPluginNr'   c                 C   s   d| _ d| _d S r>   r6   _accentuated_countr(   r#   r#   r$   r8   q   s    z!TooManyAccentuatedPlugin.__init__r   c                 C   s   |  S r9   )isalphar!   r#   r#   r$   r%   u   s    z!TooManyAccentuatedPlugin.eligiblec                 C   s(   |  j d7  _ t|r$|  jd7  _d S Nr   )r6   r
   rD   r!   r#   r#   r$   r&   x   s    zTooManyAccentuatedPlugin.feedc                 C   s   d| _ d| _d S r>   rC   r(   r#   r#   r$   r)   ~   s    zTooManyAccentuatedPlugin.resetc                 C   s*   | j dk rdS | j| j  }|dkr&|S dS )N   r?   gffffff?rC   )r"   Zratio_of_accentuationr#   r#   r$   r*      s    
zTooManyAccentuatedPlugin.ratiorA   r#   r#   r#   r$   rB   p   s   rB   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )UnprintablePluginNr'   c                 C   s   d| _ d| _d S r>   )_unprintable_countr6   r(   r#   r#   r$   r8      s    zUnprintablePlugin.__init__r   c                 C   s   dS NTr#   r!   r#   r#   r$   r%      s    zUnprintablePlugin.eligiblec                 C   s(   t |r|  jd7  _|  jd7  _d S rF   )r   rI   r6   r!   r#   r#   r$   r&      s    zUnprintablePlugin.feedc                 C   s
   d| _ d S r>   )rI   r(   r#   r#   r$   r)      s    zUnprintablePlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   r?   rG   )r6   rI   r(   r#   r#   r$   r*      s    
zUnprintablePlugin.ratiorA   r#   r#   r#   r$   rH      s   rH   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousDuplicateAccentPluginNr'   c                 C   s   d| _ d| _d | _d S r>   _successive_countr6   _last_latin_characterr(   r#   r#   r$   r8      s    z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r9   )rE   r   r!   r#   r#   r$   r%      s    z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd k	rjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S rF   )r6   rN   r
   isupperrM   r   r!   r#   r#   r$   r&      s    z$SuspiciousDuplicateAccentPlugin.feedc                 C   s   d| _ d| _d | _d S r>   rL   r(   r#   r#   r$   r)      s    z%SuspiciousDuplicateAccentPlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   r?   r<   )r6   rM   r(   r#   r#   r$   r*      s    
z%SuspiciousDuplicateAccentPlugin.ratiorA   r#   r#   r#   r$   rK      s   rK   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousRangeNr'   c                 C   s   d| _ d| _d | _d S r>   )"_suspicious_successive_range_countr6   _last_printable_seenr(   r#   r#   r$   r8      s    zSuspiciousRange.__init__r   c                 C   s   |  S r9   r:   r!   r#   r#   r$   r%      s    zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S rF   )r6   isspacer   r   rR   r    is_suspiciously_successive_rangerQ   )r"   r   unicode_range_aunicode_range_br#   r#   r$   r&      s"    


zSuspiciousRange.feedc                 C   s   d| _ d| _d | _d S r>   )r6   rQ   rR   r(   r#   r#   r$   r)      s    zSuspiciousRange.resetc                 C   s"   | j dkrdS | jd | j  }|S )N   r?   r<   )r6   rQ   )r"   Zratio_of_suspicious_range_usager#   r#   r$   r*      s    
zSuspiciousRange.ratiorA   r#   r#   r#   r$   rP      s   rP   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuperWeirdWordPluginNr'   c                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr6   _bad_character_count_buffer_buffer_accent_countr(   r#   r#   r$   r8      s    zSuperWeirdWordPlugin.__init__r   c                 C   s   dS rJ   r#   r!   r#   r#   r$   r%     s    zSuperWeirdWordPlugin.eligiblec                 C   s6  |  r|  j|7  _t|r,|  jd7  _| jdkrt|dksJt|rt|dkrt|dkrt|dkrt	|dkrt
|dkrd| _d S | jsd S | st|st|r| jr|  jd7  _t| j}|  j|7  _|dkrP| j| dkrd| _t| jd rP| jd  rPtdd | jD dkrP|  jd7  _d| _|d	kr| jrd
d t| jtd|D }d}|rt|| dkrd}|s|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _n6|dkr2| dkr2t|r2d| _|  j|7  _d S )Nr   FT   g(\?c                 s   s   | ]}|  V  qd S r9   rO   ).0_r#   r#   r$   	<genexpr>-  s     z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>rW   c                 S   s   g | ]\}}|  r|qS r#   rd   )re   cir#   r#   r$   
<listcomp>2  s   z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   r@   rY   >   rf   =-~>|<)rE   r`   r
   ra   r^   r   r   r   r   r   r   rS   r   r   rZ   lenr6   r]   rO   allr\   zipranger[   r_   r=   r   )r"   r   Zbuffer_lengthZcamel_case_dstZprobable_camel_casedr#   r#   r$   r&     s    





	


zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrY   Fr   )r`   r]   r^   r[   rZ   r6   r_   r\   r(   r#   r#   r$   r)   P  s    zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   r?   )rZ   r\   r_   r6   r(   r#   r#   r$   r*   Z  s    zSuperWeirdWordPlugin.ratiorA   r#   r#   r#   r$   rX      s   E
rX   c                   @   s^   e Zd ZdZddddZeedddZeddd	d
ZddddZ	e
edddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    Nr'   c                 C   s   d| _ d| _d S r>   _wrong_stop_count_cjk_character_countr(   r#   r#   r$   r8   h  s    zCjkInvalidStopPlugin.__init__r   c                 C   s   dS rJ   r#   r!   r#   r#   r$   r%   l  s    zCjkInvalidStopPlugin.eligiblec                 C   s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N>      丅   丄r   )rx   r   ry   r!   r#   r#   r$   r&   o  s
    zCjkInvalidStopPlugin.feedc                 C   s   d| _ d| _d S r>   rw   r(   r#   r#   r$   r)   v  s    zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   r?   )ry   rx   r(   r#   r#   r$   r*   z  s    
zCjkInvalidStopPlugin.ratio)r+   r,   r-   r.   r8   r/   r0   r%   r&   r)   r1   r2   r*   r#   r#   r#   r$   rv   b  s   rv   c                   @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )ArchaicUpperLowerPluginNr'   c                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr6   _last_alpha_seen_current_ascii_onlyr(   r#   r#   r$   r8     s    z ArchaicUpperLowerPlugin.__init__r   c                 C   s   dS rJ   r#   r!   r#   r#   r$   r%     s    z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|dk}|r| jdkr| jdkrV| dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkr|
 dkrd| _| jd k	r| r| j s| r| j r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr<   )rE   r   r   r=   r   r   r   r   r~   r6   isasciirO   islower)r"   r   Zis_concernedZ	chunk_sepr#   r#   r$   r&     sF    


zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r6   r   r   r   r   r~   r   r(   r#   r#   r$   r)     s    zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   r?   )r6   r   r(   r#   r#   r$   r*     s    
zArchaicUpperLowerPlugin.ratiorA   r#   r#   r#   r$   r}     s   *	r}   c                   @   sZ   e Zd ZddddZddddZeeddd	Zeddd
dZe	e
dddZdS )ArabicIsolatedFormPluginNr'   c                 C   s   d| _ d| _d S r>   r6   _isolated_form_countr(   r#   r#   r$   r8     s    z!ArabicIsolatedFormPlugin.__init__c                 C   s   d| _ d| _d S r>   r   r(   r#   r#   r$   r)     s    zArabicIsolatedFormPlugin.resetr   c                 C   s   t |S r9   )r   r!   r#   r#   r$   r%     s    z!ArabicIsolatedFormPlugin.eligiblec                 C   s(   |  j d7  _ t|r$|  jd7  _d S rF   )r6   r   r   r!   r#   r#   r$   r&     s    zArabicIsolatedFormPlugin.feedc                 C   s   | j dk rdS | j| j  }|S )NrG   r?   r   )r"   Zisolated_form_usager#   r#   r$   r*     s    
zArabicIsolatedFormPlugin.ratio)r+   r,   r-   r8   r)   r/   r0   r%   r&   r1   r2   r*   r#   r#   r#   r$   r     s   r      )maxsize)rU   rV   r   c                 C   s  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |  d| d }}|D ]}|tkrq||kr dS q| dk|dk }}|s|rd	| ksd	|krdS |r|rdS d
| ksd
|kr d	| ksd	|krdS | dks|dkr dS d	| ksHd	|ksH| dkr|dkrd| ks\d|kr`dS d| kstd|krxdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKZHangulzBasic Latin)r   r   ZPunctuationZForms)splitr	   )rU   rV   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr#   r#   r$   rT     sl    rT   i   皙?F)decoded_sequencemaximum_thresholddebugr   c              	   C   sX  dd t  D }t| d }d}|dk r0d}n|dkr>d}nd	}t| d
 t|D ]d\}}|D ]}	|	|r`|	| q`|dkr|| dks||d krTtdd |D }||krT qqT|rNtd}
|
	t
d| d| d|  t| dkr(|
	t
d| dd   |
	t
d| dd   |D ] }|
	t
|j d|j  q,t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]
}| qS r#   r#   )re   Zmd_classr#   r#   r$   rj   :  s    zmess_ratio.<locals>.<listcomp>r   r?   i       r   r      
r   c                 s   s   | ]}|j V  qd S r9   )r*   )re   dtr#   r#   r$   rg   Q  s     zmess_ratio.<locals>.<genexpr>Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r|   zStarting with: NzEnding with: iz:    )r   __subclasses__rq   rs   rt   r%   r&   sumr   logr   	__class__r*   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r#   r#   r$   
mess_ratio2  sF    


r   N)r   F),	functoolsr   loggingr   typingr   r   Zconstantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r3   rB   rH   rK   rP   rX   rv   r}   r   r/   r0   rT   r2   r   r#   r#   r#   r$   <module>   s8   L"/%1iL H     