3
eOO                 @   s  d dl Z d dlmZmZ d dlmZmZmZmZ yd dl	m
Z
 W n ek
rX   eZ
Y nX ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZm Z m!Z!m"Z" e j#dZ$e j% Z&e&j'e j(d de)e*e*e+ee ee e,e,ed	ddZ-dee*e*e+ee ee e,e,ed	ddZ.d e
e*e*e+ee ee e,e,ed	ddZ/d!e
e*e*e+ee ee e,edddZ0dS )"    N)basenamesplitext)BinaryIOListOptionalSet)PathLike   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encoding	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomZcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF)		sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc       1   .   C   s	  t | ttfs tdjt| |r>tj}tjt	 tj
t t| }	|	dkrtjd |rvtjt	 tj
|prtj tt| dddg dgS |dk	rtjtd	d
j| dd |D }ng }|dk	rtjtdd
j| dd |D }ng }|	|| krtjtd|||	 d}|	}|dkr:|	| |k r:t|	| }t| tk }
t| tk}|
rltjtdj|	 n|rtjtdj|	 g }|rt| nd}|dk	r|j| tjtd| t }g }g }d}d}d}t }t| \}}|dk	r|j| tjtdt|| |jd d|kr.|jd xv|t D ]h}|rT||krTq:|rh||krhq:||krvq:|j| d}||k}|ot|}|d<kr| rtjtd| q:yt|}W n, t t!fk
r   tjtd| w:Y nX yr|rB|dkrBt"|dkr&| dtd n| t|td |d n&t"|dkrR| n| t|d |d}W nV t#t$fk
r } z4t |t$stjtd|t"| |j| w:W Y dd}~X nX d}x |D ]}t%||rd}P qW |rtjtd|| q:t&|sdnt||	t|	| }|o>|dk	o>t||	k } | rTtjtd| tt|d  }!t'|!d!}!d}"d}#g }$g }%x|D ]}&|&| |	d" krq| |&|&|  }'|r|dkr||' }'y|'j(||rd#nd$d%}(W nB t#k
r( } z$tjtd&|t"| |!}"d}#P W Y dd}~X nX |r|&dkr| |& d'krt)|d(})|r|(d|) |krxdt&|&|&d  d=D ]P}*| |*|&|  }'|r|dkr||' }'|'j(|d#d%}(|(d|) |kr|P q|W |$j|( |%jt*|(| |%d> |kr |"d7 }"|"|!ks|r|dkrP qW |# r|r| ry| td)d j(|d$d% W nF t#k
r } z(tjtd*|t"| |j| w:W Y dd}~X nX |%rt+|%t|% nd}+|+|ks|"|!krF|j| tjtd+||"t,|+d, d-d. |dd|gkr:|# r:t| ||dg |},||kr.|,}n|dkr>|,}n|,}q:tjtd/|t,|+d, d-d. |srt-|}-nt.|}-|-rtjtd0j|t"|- g }.|dkrx4|$D ],}(t/|(d1|-rd2j|-nd}/|.j|/ qW t0|.}0|0rtjtd3j|0| |jt| ||+||0| ||ddgkrd|+d1k rdtjd4| |rVtjt	 tj
| t|| gS ||kr:tjd5| |rtjt	 tj
| t|| gS q:W t|dk	rX|s|s|rtjtd6 |rtjd7|j1 |j| nd|	r|dk	s(|	r|	r|j2|j2k	s(|dk		r>tjd8 |j| n|	rXtjd9 |j| |	r|tjd:|j3 j1t|d  n
tjd; |	rtjt	 tj
| |S )?ae  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z4Expected object of type bytes or bytearray, got: {0}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c             S   s   g | ]}t |d qS )F)r   ).0cp r+   Z/var/www/html/StaffProfile/staffvenv/lib/python3.6/site-packages/charset_normalizer/api.py
<listcomp>]   s    zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c             S   s   g | ]}t |d qS )F)r   )r)   r*   r+   r+   r,   r-   h   s    z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r	   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.asciiutf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.         ignorestrict)errorszaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s      g     j@z^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.>   r/   r0   r>   )4
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerloggingWARNINGr   r   logjoinintr   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorr   rangemaxdecodeminr   sumroundr   r   r
   r   r1   fingerprintbest)1r   r   r    r!   r"   r#   r$   r%   Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZfallback_asciiZfallback_u8Zfallback_specifiedresultsZsig_encodingZsig_payloadZencoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decodereZsimilar_soft_failure_testZencoding_soft_failedZr_Zmulti_byte_bonusZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ	md_chunksZ	md_ratiosiZcut_sequencechunkZchunk_partial_size_chkjZmean_mess_ratioZfallback_entryZtarget_languagesZ	cd_ratiosZchunk_languagesZcd_ratios_mergedr+   r+   r,   
from_bytes%   sZ   














































rh   )	fpr   r    r!   r"   r#   r$   r%   r&   c          	   C   s   t | j |||||||S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )rh   read)ri   r   r    r!   r"   r#   r$   r%   r+   r+   r,   from_fp  s    rk   )	pathr   r    r!   r"   r#   r$   r%   r&   c       	      C   s,   t | d}t||||||||S Q R X dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openrk   )	rl   r   r    r!   r"   r#   r$   r%   ri   r+   r+   r,   	from_path  s    ro   )rl   r   r    r!   r"   r#   r$   r&   c          	   C   s   t | ||||||}t| }tt|}	t|dkrBtdj||j }
|	d  d|
j 7  < t	djt
| j|dj|	d}|j|
j  W dQ R X |
S )zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r(   wbN)ro   r   listr   rJ   IOErrorrC   ra   r1   rn   rW   replacerP   writeoutput)rl   r   r    r!   r"   r#   r$   rc   filenameZtarget_extensionsresultri   r+   r+   r,   	normalize7  s*     ry   )r   r   r   NNTF)r   r   r   NNTF)r   r   r   NNTF)r   r   r   NNT)1rM   os.pathr   r   typingr   r   r   r   osr   rV   rW   Zcdr
   r   r   r   Zconstantr   r   r   r   Zmdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerrE   StreamHandlerrH   setFormatter	FormatterrA   rQ   floatboolrh   rk   ro   ry   r+   r+   r+   r,   <module>   sb   
 
         Y                 