3
e3                 @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZ G dd dZG dd dZeeef Z ee  Z!G dd dZ"dS )    N)Counter)aliases)sha256)dumps)sub)AnyDictIteratorListOptionalTupleUnion   )NOT_PRINTABLE_PATTERNTOO_BIG_SEQUENCE)
mess_ratio)	iana_nameis_multi_byte_encodingunicode_rangec               @   s   e Zd ZdBeeeedee dddZe	edddZ
e	edd	d
ZeedddZeedddZeedddZeedddZedddZedddZd ddddZeedddZeee dddZeedddZeedd d!Zeee dd"d#Zeedd$d%Zeedd&d'Zeedd(d)Zeedd*d+Zeedd,d-Zeedd.d/Z eed  dd0d1Z!eedd2d3Z"eee dd4d5Z#eee dd6d7Z$d dd8d9Z%d dd:d;Z&dCeed=d>d?Z'eedd@dAZ(dS )DCharsetMatchNCoherenceMatches)payloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesdecoded_payloadc             C   sF   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
d S )Ng        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r    r(   ]/var/www/html/StaffProfile/staffvenv/lib/python3.6/site-packages/charset_normalizer/models.py__init__   s    	zCharsetMatch.__init__)otherreturnc             C   s>   t |ts&tdjt|jt| j| j|jko<| j|jkS )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprint)r'   r+   r(   r(   r)   __eq__(   s
    
zCharsetMatch.__eq__c             C   sv   t |tstt| j|j }t| j|j }|dk rj|dkrj|dkr^| j|jkr^| j|jkS | j|jkS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?g{Gz?g        )r-   r   
ValueErrorabschaos	coherencemulti_byte_usage)r'   r+   Zchaos_differenceZcoherence_differencer(   r(   r)   __lt__1   s    
zCharsetMatch.__lt__)r,   c             C   s   dt t| t | j  S )Ng      ?)lenr0   raw)r'   r(   r(   r)   r9   D   s    zCharsetMatch.multi_byte_usagec             C   s   t jdt tt| dS )z
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        z=chaos_secondary_pass is deprecated and will be removed in 3.0g      ?)warningswarnDeprecationWarningr   r0   )r'   r(   r(   r)   chaos_secondary_passH   s    z!CharsetMatch.chaos_secondary_passc             C   s   t jdt dS )zy
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        z<coherence_non_latin is deprecated and will be removed in 3.0g        )r=   r>   r?   )r'   r(   r(   r)   coherence_non_latinU   s    z CharsetMatch.coherence_non_latinc             C   s,   t jdt ttdt| j }t|j S )z_
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        z2w_counter is deprecated and will be removed in 3.0 )	r=   r>   r?   r   r   r0   lowerr   split)r'   Zstring_printable_onlyr(   r(   r)   	w_countera   s    zCharsetMatch.w_counterc             C   s"   | j d krt| j| jd| _ | j S )Nstrict)r&   r0   r   r   )r'   r(   r(   r)   __str__o   s    
zCharsetMatch.__str__c             C   s   dj | j| jS )Nz<CharsetMatch '{}' bytes({})>)r/   r2   r3   )r'   r(   r(   r)   __repr__u   s    zCharsetMatch.__repr__c             C   s:   t |t s|| kr$tdj|jd |_| jj| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r-   r   r5   r/   r1   r&   r#   append)r'   r+   r(   r(   r)   add_submatchx   s    
zCharsetMatch.add_submatchc             C   s   | j S )N)r   )r'   r(   r(   r)   r2      s    zCharsetMatch.encodingc             C   sH   g }x>t j D ]2\}}| j|kr,|j| q| j|kr|j| qW |S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr2   rI   )r'   Zalso_known_asupr(   r(   r)   encoding_aliases   s    

zCharsetMatch.encoding_aliasesc             C   s   | j S )N)r!   )r'   r(   r(   r)   bom   s    zCharsetMatch.bomc             C   s   | j S )N)r!   )r'   r(   r(   r)   byte_order_mark   s    zCharsetMatch.byte_order_markc             C   s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c             S   s   g | ]}|d  qS )r   r(   ).0er(   r(   r)   
<listcomp>   s    z*CharsetMatch.languages.<locals>.<listcomp>)r    )r'   r(   r(   r)   r      s    zCharsetMatch.languagesc             C   sp   | j sbd| jkrdS ddlm}m} t| jr8|| jn|| j}t|dksVd|krZdS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiZEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r    could_be_from_charsetZcharset_normalizer.cdrU   rV   r   r2   r;   )r'   rU   rV   r   r(   r(   r)   language   s    

zCharsetMatch.languagec             C   s   | j S )N)r   )r'   r(   r(   r)   r7      s    zCharsetMatch.chaosc             C   s   | j s
dS | j d d S )Ng        r   r   )r    )r'   r(   r(   r)   r8      s    zCharsetMatch.coherencec             C   s   t | jd ddS )Nd      )ndigits)roundr7   )r'   r(   r(   r)   percent_chaos   s    zCharsetMatch.percent_chaosc             C   s   t | jd ddS )NrZ   r[   )r\   )r]   r8   )r'   r(   r(   r)   percent_coherence   s    zCharsetMatch.percent_coherencec             C   s   | j S )z+
        Original untouched bytes.
        )r   )r'   r(   r(   r)   r<      s    zCharsetMatch.rawc             C   s   | j S )N)r#   )r'   r(   r(   r)   submatch   s    zCharsetMatch.submatchc             C   s   t | jdkS )Nr   )r;   r#   )r'   r(   r(   r)   has_submatch   s    zCharsetMatch.has_submatchc             C   s@   | j d k	r| j S dd t| D }ttdd |D | _ | j S )Nc             S   s   g | ]}t |qS r(   )r   )rQ   charr(   r(   r)   rS      s    z*CharsetMatch.alphabets.<locals>.<listcomp>c             S   s   h | ]}|r|qS r(   r(   )rQ   rr(   r(   r)   	<setcomp>   s    z)CharsetMatch.alphabets.<locals>.<setcomp>)r"   r0   sortedlist)r'   Zdetected_rangesr(   r(   r)   	alphabets   s
    
zCharsetMatch.alphabetsc             C   s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c             S   s   g | ]
}|j qS r(   )r2   )rQ   mr(   r(   r)   rS      s    z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r#   )r'   r(   r(   r)   rX      s    z"CharsetMatch.could_be_from_charsetc             C   s   | S )z>
        Kept for BC reasons. Will be removed in 3.0.
        r(   )r'   r(   r(   r)   first   s    zCharsetMatch.firstc             C   s   | S )z>
        Kept for BC reasons. Will be removed in 3.0.
        r(   )r'   r(   r(   r)   best   s    zCharsetMatch.bestutf_8)r2   r,   c             C   s2   | j dks| j |kr,|| _ t| j|d| _| jS )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        Nreplace)r%   r0   encoder$   )r'   r2   r(   r(   r)   output  s    zCharsetMatch.outputc             C   s   t | j j S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   rn   	hexdigest)r'   r(   r(   r)   r3     s    zCharsetMatch.fingerprint)N)rk   ))__name__
__module____qualname__bytesr0   floatboolr   r*   objectr4   r:   propertyr9   r@   rA   r   rE   rG   rH   rJ   r2   r
   rN   rO   rP   r   rY   r7   r8   r^   r_   r<   r`   ra   rg   rX   ri   rj   rn   r3   r(   r(   r(   r)   r      sb   	r   c               @   s   e Zd ZdZdee dddZee dddZe	e
ef ed	d
dZe
dddZedddZedd	ddZed dddZed dddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    N)resultsc             C   s   |rt |ng | _d S )N)re   _results)r'   ry   r(   r(   r)   r*     s    zCharsetMatches.__init__)r,   c             c   s   | j E d H  d S )N)rz   )r'   r(   r(   r)   __iter__  s    zCharsetMatches.__iter__)itemr,   c             C   sN   t |tr| j| S t |trFt|d}x| jD ]}||jkr0|S q0W tdS )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        FN)r-   intrz   r0   r   rX   KeyError)r'   r|   resultr(   r(   r)   __getitem__!  s    




zCharsetMatches.__getitem__c             C   s
   t | jS )N)r;   rz   )r'   r(   r(   r)   __len__/  s    zCharsetMatches.__len__c             C   s   t | jdkS )Nr   )r;   rz   )r'   r(   r(   r)   __bool__2  s    zCharsetMatches.__bool__c             C   s~   t |tstdjt|jt|jtkrbx4| j	D ]*}|j
|j
kr4|j|jkr4|j| dS q4W | j	j| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r-   r   r5   r/   r0   r1   r;   r<   r   rz   r3   r7   rJ   rI   re   )r'   r|   matchr(   r(   r)   rI   5  s    

zCharsetMatches.appendr   c             C   s   | j s
dS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   )rz   )r'   r(   r(   r)   rj   I  s    zCharsetMatches.bestc             C   s   | j  S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )rj   )r'   r(   r(   r)   ri   Q  s    zCharsetMatches.first)N)rp   rq   rr   __doc__r
   r   r*   r	   r{   r   r}   r0   r   r   ru   r   rI   r   rj   ri   r(   r(   r(   r)   rx     s   rx   c               @   sj   e Zd Zeee ee ee eee eeeee edddZe	e
eef dddZedddZd	S )
CliDetectionResult)pathr2   rN   alternative_encodingsrY   rg   r   r7   r8   unicode_pathis_preferredc             C   sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S )N)r   r   r2   rN   r   rY   rg   r   r7   r8   r   )r'   r   r2   rN   r   rY   rg   r   r7   r8   r   r   r(   r(   r)   r*   ]  s    zCliDetectionResult.__init__)r,   c             C   s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )N)r   r2   rN   r   rY   rg   r   r7   r8   r   r   )r   r2   rN   r   rY   rg   r   r7   r8   r   r   )r'   r(   r(   r)   __dict__w  s    zCliDetectionResult.__dict__c             C   s   t | jdddS )NT   )ensure_asciiindent)r   r   )r'   r(   r(   r)   to_json  s    zCliDetectionResult.to_jsonN)rp   rq   rr   r0   r   r
   ru   rt   r*   rw   r   r   r   r   r(   r(   r(   r)   r   \  s   r   )#r=   collectionsr   Zencodings.aliasesr   hashlibr   jsonr   rer   typingr   r   r	   r
   r   r   r   Zconstantr   r   Zmdr   utilsr   r   r   r   rx   r0   rt   ZCoherenceMatchr   r   r(   r(   r(   r)   <module>   s    $  	C