# charset_normalizer utils: single-character classification and IANA encoding-name helpers.
  1. try:
  2. import unicodedata2 as unicodedata
  3. except ImportError:
  4. import unicodedata
  5. from codecs import IncrementalDecoder
  6. from re import findall
  7. from typing import Optional, Tuple, Union, List, Set
  8. import importlib
  9. from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
  10. from encodings.aliases import aliases
  11. from functools import lru_cache
  12. from charset_normalizer.constant import UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, \
  13. RE_POSSIBLE_ENCODING_INDICATION, ENCODING_MARKS, UTF8_MAXIMAL_ALLOCATION, IANA_SUPPORTED_SIMILAR
  14. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  15. def is_accentuated(character: str) -> bool:
  16. try:
  17. description = unicodedata.name(character) # type: str
  18. except ValueError:
  19. return False
  20. return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description
  21. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  22. def remove_accent(character: str) -> str:
  23. decomposed = unicodedata.decomposition(character) # type: str
  24. if not decomposed:
  25. return character
  26. codes = decomposed.split(" ") # type: List[str]
  27. return chr(
  28. int(
  29. codes[0],
  30. 16
  31. )
  32. )
  33. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  34. def unicode_range(character: str) -> Optional[str]:
  35. """
  36. Retrieve the Unicode range official name from a single character.
  37. """
  38. character_ord = ord(character) # type: int
  39. for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
  40. if character_ord in ord_range:
  41. return range_name
  42. return None
  43. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  44. def is_latin(character: str) -> bool:
  45. try:
  46. description = unicodedata.name(character) # type: str
  47. except ValueError:
  48. return False
  49. return "LATIN" in description
  50. def is_ascii(character: str) -> bool:
  51. try:
  52. character.encode("ascii")
  53. except UnicodeEncodeError:
  54. return False
  55. return True
  56. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  57. def is_punctuation(character: str) -> bool:
  58. character_category = unicodedata.category(character) # type: str
  59. if "P" in character_category:
  60. return True
  61. character_range = unicode_range(character) # type: Optional[str]
  62. if character_range is None:
  63. return False
  64. return "Punctuation" in character_range
  65. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  66. def is_symbol(character: str) -> bool:
  67. character_category = unicodedata.category(character) # type: str
  68. if "S" in character_category or "N" in character_category:
  69. return True
  70. character_range = unicode_range(character) # type: Optional[str]
  71. if character_range is None:
  72. return False
  73. return "Forms" in character_range
  74. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  75. def is_separator(character: str) -> bool:
  76. if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
  77. return True
  78. character_category = unicodedata.category(character) # type: str
  79. return "Z" in character_category
  80. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  81. def is_case_variable(character: str) -> bool:
  82. return character.islower() != character.isupper()
  83. def is_private_use_only(character: str) -> bool:
  84. character_category = unicodedata.category(character) # type: str
  85. return "Co" == character_category
  86. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  87. def is_cjk(character: str) -> bool:
  88. try:
  89. character_name = unicodedata.name(character)
  90. except ValueError:
  91. return False
  92. return "CJK" in character_name
  93. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  94. def is_hiragana(character: str) -> bool:
  95. try:
  96. character_name = unicodedata.name(character)
  97. except ValueError:
  98. return False
  99. return "HIRAGANA" in character_name
  100. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  101. def is_katakana(character: str) -> bool:
  102. try:
  103. character_name = unicodedata.name(character)
  104. except ValueError:
  105. return False
  106. return "KATAKANA" in character_name
  107. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  108. def is_hangul(character: str) -> bool:
  109. try:
  110. character_name = unicodedata.name(character)
  111. except ValueError:
  112. return False
  113. return "HANGUL" in character_name
  114. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  115. def is_thai(character: str) -> bool:
  116. try:
  117. character_name = unicodedata.name(character)
  118. except ValueError:
  119. return False
  120. return "THAI" in character_name
  121. @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
  122. def is_unicode_range_secondary(range_name: str) -> bool:
  123. for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
  124. if keyword in range_name:
  125. return True
  126. return False
  127. def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
  128. """
  129. Extract using ASCII-only decoder any specified encoding in the first n-bytes.
  130. """
  131. if not isinstance(sequence, bytes):
  132. raise TypeError
  133. seq_len = len(sequence) # type: int
  134. results = findall(
  135. RE_POSSIBLE_ENCODING_INDICATION,
  136. sequence[:seq_len if seq_len <= search_zone else search_zone].decode('ascii', errors='ignore')
  137. ) # type: List[str]
  138. if len(results) == 0:
  139. return None
  140. for specified_encoding in results:
  141. specified_encoding = specified_encoding.lower().replace('-', '_')
  142. for encoding_alias, encoding_iana in aliases.items():
  143. if encoding_alias == specified_encoding:
  144. return encoding_iana
  145. if encoding_iana == specified_encoding:
  146. return encoding_iana
  147. return None
  148. @lru_cache(maxsize=128)
  149. def is_multi_byte_encoding(name: str) -> bool:
  150. """
  151. Verify is a specific encoding is a multi byte one based on it IANA name
  152. """
  153. return name in {"utf_8", "utf_8_sig", "utf_16", "utf_16_be", "utf_16_le", "utf_32", "utf_32_le", "utf_32_be", "utf_7"} or issubclass(
  154. importlib.import_module('encodings.{}'.format(name)).IncrementalDecoder, # type: ignore
  155. MultibyteIncrementalDecoder
  156. )
  157. def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
  158. """
  159. Identify and extract SIG/BOM in given sequence.
  160. """
  161. for iana_encoding in ENCODING_MARKS:
  162. marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
  163. if isinstance(marks, bytes):
  164. marks = [marks]
  165. for mark in marks:
  166. if sequence.startswith(mark):
  167. return iana_encoding, mark
  168. return None, b""
  169. def should_strip_sig_or_bom(iana_encoding: str) -> bool:
  170. return iana_encoding not in {"utf_16", "utf_32"}
  171. def iana_name(cp_name: str, strict: bool = True) -> str:
  172. cp_name = cp_name.lower().replace('-', '_')
  173. for encoding_alias, encoding_iana in aliases.items():
  174. if cp_name == encoding_alias or cp_name == encoding_iana:
  175. return encoding_iana
  176. if strict:
  177. raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
  178. return cp_name
  179. def range_scan(decoded_sequence: str) -> List[str]:
  180. ranges = set() # type: Set[str]
  181. for character in decoded_sequence:
  182. character_range = unicode_range(character) # type: Optional[str]
  183. if character_range is None:
  184. continue
  185. ranges.add(
  186. character_range
  187. )
  188. return list(ranges)
  189. def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
  190. if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
  191. return 0.
  192. decoder_a = importlib.import_module('encodings.{}'.format(iana_name_a)).IncrementalDecoder # type: ignore
  193. decoder_b = importlib.import_module('encodings.{}'.format(iana_name_b)).IncrementalDecoder # type: ignore
  194. id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
  195. id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
  196. character_match_count = 0 # type: int
  197. for i in range(0, 255):
  198. to_be_decoded = bytes([i]) # type: bytes
  199. if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
  200. character_match_count += 1
  201. return character_match_count / 254
  202. def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
  203. """
  204. Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
  205. the function cp_similarity.
  206. """
  207. return iana_name_a in IANA_SUPPORTED_SIMILAR and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]