o
    'hN                     @  sV  d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&eddd/d"d#Z'ed$d	&d0d1d,d-Z(d.S )2    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangeis_cjk_uncommonc                   @  s<   e Zd ZdZdddZdd	d
ZdddZedddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterstrreturnboolc                 C     t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r%   Q/var/www/html/olx_land/venv/lib/python3.10/site-packages/charset_normalizer/md.pyeligible'      zMessDetectorPlugin.eligibleNonec                 C  r    )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r!   r#   r%   r%   r&   feed-   s   zMessDetectorPlugin.feedc                 C  r    )zB
        Permit to reset the plugin to the initial state.
        r!   r$   r%   r%   r&   reset4   r(   zMessDetectorPlugin.resetfloatc                 C  r    )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r!   r+   r%   r%   r&   ratio:   s   zMessDetectorPlugin.ratioNr   r   r   r   r   r   r   r)   r   r)   r   r-   )	__name__
__module____qualname____doc__r'   r*   r,   propertyr.   r%   r%   r%   r&   r   !   s    


r   c                   @  B   e Zd ZdddZddd	Zdd
dZdddZedddZdS ) TooManySymbolOrPunctuationPluginr   r)   c                 C  s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr+   r%   r%   r&   __init__D   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   r   r   c                 C     |  S Nisprintabler#   r%   r%   r&   r'   L      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C  sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r<   r=   r   r   r:   isdigitr   r   r;   r#   r%   r%   r&   r*   O   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C  s   d| _ d| _d| _d S Nr   )r:   r<   r;   r+   r%   r%   r&   r,   a      
z&TooManySymbolOrPunctuationPlugin.resetr-   c                 C  s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)r<   r:   r;   )r$   ratio_of_punctuationr%   r%   r&   r.   f   s   

z&TooManySymbolOrPunctuationPlugin.ratioNr1   r/   r0   r2   	r3   r4   r5   r?   r'   r*   r,   r7   r.   r%   r%   r%   r&   r9   C   s    



r9   c                   @  r8   )TooManyAccentuatedPluginr   r)   c                 C     d| _ d| _d S rG   r<   _accentuated_countr+   r%   r%   r&   r?   s      
z!TooManyAccentuatedPlugin.__init__r   r   r   c                 C  r@   rA   )isalphar#   r%   r%   r&   r'   w   rD   z!TooManyAccentuatedPlugin.eligiblec                 C  ,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r<   r	   rP   r#   r%   r%   r&   r*   z      zTooManyAccentuatedPlugin.feedc                 C  rN   rG   rO   r+   r%   r%   r&   r,      rQ   zTooManyAccentuatedPlugin.resetr-   c                 C  s*   | j dk rdS | j| j  }|dkr|S dS )N   rI   gffffff?rO   )r$   ratio_of_accentuationr%   r%   r&   r.      s   
zTooManyAccentuatedPlugin.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   rM   r   s    



rM   c                   @  r8   )UnprintablePluginr   r)   c                 C  rN   rG   )_unprintable_countr<   r+   r%   r%   r&   r?      rQ   zUnprintablePlugin.__init__r   r   r   c                 C     dS NTr%   r#   r%   r%   r&   r'         zUnprintablePlugin.eligiblec                 C  s(   t |r|  jd7  _|  jd7  _d S rT   )r   rY   r<   r#   r%   r%   r&   r*      s   zUnprintablePlugin.feedc                 C  s
   d| _ d S rG   )rY   r+   r%   r%   r&   r,      s   
zUnprintablePlugin.resetr-   c                 C     | j dkrdS | jd | j  S )Nr   rI   rV   )r<   rY   r+   r%   r%   r&   r.         
zUnprintablePlugin.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   rX      s    



rX   c                   @  r8   )SuspiciousDuplicateAccentPluginr   r)   c                 C     d| _ d| _d | _d S rG   _successive_countr<   _last_latin_characterr+   r%   r%   r&   r?      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   r   r   c                 C  s   |  ot|S rA   )rR   r   r#   r%   r%   r&   r'      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C  st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rT   )r<   rc   r	   isupperrb   r   r#   r%   r%   r&   r*      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C  r`   rG   ra   r+   r%   r%   r&   r,      rH   z%SuspiciousDuplicateAccentPlugin.resetr-   c                 C  r]   )Nr   rI   rE   )r<   rb   r+   r%   r%   r&   r.      r^   z%SuspiciousDuplicateAccentPlugin.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   r_      s    



r_   c                   @  r8   )SuspiciousRanger   r)   c                 C  r`   rG   )"_suspicious_successive_range_countr<   _last_printable_seenr+   r%   r%   r&   r?      rH   zSuspiciousRange.__init__r   r   r   c                 C  r@   rA   rB   r#   r%   r%   r&   r'      rD   zSuspiciousRange.eligiblec                 C  sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rT   )r<   isspacer   r   rg   r    is_suspiciously_successive_rangerf   )r$   r   unicode_range_aunicode_range_br%   r%   r&   r*      s    



zSuspiciousRange.feedc                 C  r`   rG   )r<   rf   rg   r+   r%   r%   r&   r,      rH   zSuspiciousRange.resetr-   c                 C  s"   | j dkrdS | jd | j  }|S )N   rI   rE   )r<   rf   )r$   ratio_of_suspicious_range_usager%   r%   r&   r.      s   
zSuspiciousRange.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   re      s    



re   c                   @  r8   )SuperWeirdWordPluginr   r)   c                 C  s@   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d S )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr<   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr+   r%   r%   r&   r?      s   
zSuperWeirdWordPlugin.__init__r   r   r   c                 C  rZ   r[   r%   r#   r%   r%   r&   r'     r\   zSuperWeirdWordPlugin.eligiblec                 C  s  |  rc|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _t|sZt|sZt|sZt	|sZt
|ra|  jd7  _d S | jshd S | sut|sut|r$| jr$|  jd7  _t| j}|  j|7  _|dkr| j| dkrd| _n4t| jd r| jd  rtdd | jD du r|  jd7  _d| _n| jdkrd| _|  jd7  _|d	kr| jrd
d t| jtd|D }d}|rt|| dkrd}|s|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _d| _d S |dvrA| du rCt|rEd| _|  j|7  _d S d S d S d S )Nr   FT         ?c                 s  s    | ]}|  V  qd S rA   rd   ).0_r%   r%   r&   	<genexpr>8  s    z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>   c                 S  s   g | ]
\}}|  r|qS r%   r|   )r}   cir%   r%   r&   
<listcomp>@  s    z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   rJ   ro   >   r~   -<=>|~)rR   rv   r	   rw   rt   r   r   r   r   r   r   rx   rh   r   r   rp   lenr<   rs   rd   allrr   ziprangerq   ru   rF   r   )r$   r   buffer_lengthcamel_case_dstprobable_camel_casedr%   r%   r&   r*     s   




zSuperWeirdWordPlugin.feedc                 C  s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nro   Fr   )rv   rs   rt   rq   rp   r<   ru   rr   r+   r%   r%   r&   r,   _  s   
zSuperWeirdWordPlugin.resetr-   c                 C  s$   | j dkr| jdkrdS | j| j S )N
   r   rI   )rp   rr   ru   r<   r+   r%   r%   r&   r.   i  s   zSuperWeirdWordPlugin.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   rn      s    



Q
rn   c                   @  sF   e Zd ZdZdddZdd	d
ZdddZdddZedddZ	dS )CjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    r   r)   c                 C  rN   rG   r<   _uncommon_countr+   r%   r%   r&   r?   v  rQ   zCjkUncommonPlugin.__init__r   r   r   c                 C     t |S rA   )r   r#   r%   r%   r&   r'   z  rD   zCjkUncommonPlugin.eligiblec                 C  rS   rT   )r<   r   r   r#   r%   r%   r&   r*   }  s
   zCjkUncommonPlugin.feedc                 C  rN   rG   r   r+   r%   r%   r&   r,     rQ   zCjkUncommonPlugin.resetr-   c                 C  s.   | j dk rdS | j| j  }|dkr|d S dS )NrV   rI   rz   r   r   )r$   uncommon_form_usager%   r%   r&   r.     s   
zCjkUncommonPlugin.ratioNr1   r/   r0   r2   )
r3   r4   r5   r6   r?   r'   r*   r,   r7   r.   r%   r%   r%   r&   r   q  s    



r   c                   @  r8   )ArchaicUpperLowerPluginr   r)   c                 C  s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr<   _last_alpha_seen_current_ascii_onlyr+   r%   r%   r&   r?     s   
z ArchaicUpperLowerPlugin.__init__r   r   r   c                 C  rZ   r[   r%   r#   r%   r%   r&   r'     r\   z ArchaicUpperLowerPlugin.eligiblec                 C  s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQ|
 du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   TrE   )rR   r   r   rF   r   r   r   r   r   r<   isasciird   islower)r$   r   is_concerned	chunk_sepr%   r%   r&   r*     s@   




zArchaicUpperLowerPlugin.feedc                 C  s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r<   r   r   r   r   r   r   r+   r%   r%   r&   r,     s   
zArchaicUpperLowerPlugin.resetr-   c                 C  s   | j dkrdS | j| j  S )Nr   rI   )r<   r   r+   r%   r%   r&   r.     s   
zArchaicUpperLowerPlugin.ratioNr1   r/   r0   r2   rL   r%   r%   r%   r&   r     s    



*	r   c                   @  sB   e Zd ZdddZdddZdd
dZdddZedddZdS )ArabicIsolatedFormPluginr   r)   c                 C  rN   rG   r<   _isolated_form_countr+   r%   r%   r&   r?     rQ   z!ArabicIsolatedFormPlugin.__init__c                 C  rN   rG   r   r+   r%   r%   r&   r,     rQ   zArabicIsolatedFormPlugin.resetr   r   r   c                 C  r   rA   )r
   r#   r%   r%   r&   r'     rD   z!ArabicIsolatedFormPlugin.eligiblec                 C  rS   rT   )r<   r   r   r#   r%   r%   r&   r*     rU   zArabicIsolatedFormPlugin.feedr-   c                 C  s   | j dk rdS | j| j  }|S )NrV   rI   r   )r$   isolated_form_usager%   r%   r&   r.     s   
zArabicIsolatedFormPlugin.ratioNr1   r/   r0   r2   )	r3   r4   r5   r?   r,   r'   r*   r7   r.   r%   r%   r%   r&   r     s    



r      )maxsizerj   
str | Nonerk   r   r   c                 C  sv  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr   )rj   rk   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charsr%   r%   r&   ri     sZ   ri   i   皙?Fdecoded_sequencer   maximum_thresholdr-   debugc              	   C  sR  dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rtd}
|
	t
d| d| d|  t| dkr|
	t
d| dd   |
	t
d| dd   |D ]}|
	t
|j d|j  qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S  s   g | ]}| qS r%   r%   )r}   md_classr%   r%   r&   r   N  s    zmess_ratio.<locals>.<listcomp>r   rI   i       r   r      
r   c                 s  s    | ]}|j V  qd S rA   )r.   )r}   dtr%   r%   r&   r   e  s    zmess_ratio.<locals>.<genexpr>charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r'   r*   sumr   logr   	__class__r.   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r%   r%   r&   
mess_ratioF  sN   


r   N)rj   r   rk   r   r   r   )r   F)r   r   r   r-   r   r   r   r-   ))
__future__r   	functoolsr   loggingr   constantr   r   r   utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r9   rM   rX   r_   re   rn   r   r   r   ri   r   r%   r%   r%   r&   <module>   s(    P"/%1v#LI