# charset_normalizer utils: single-character classification and IANA encoding-name helpers.
  1. try:
  2. import unicodedata2 as unicodedata
  3. except ImportError:
  4. import unicodedata
  5. from codecs import IncrementalDecoder
  6. from re import findall
  7. from typing import Optional, Tuple, Union, List, Set
  8. import importlib
  9. from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
  10. from encodings.aliases import aliases
  11. from functools import lru_cache
  12. from charset_normalizer.constant import UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, \
  13. RE_POSSIBLE_ENCODING_INDICATION, ENCODING_MARKS, UTF8_MAXIMAL_ALLOCATION, IANA_SUPPORTED_SIMILAR
  14. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  15. def is_accentuated(character: str) -> bool:
  16. try:
  17. description = unicodedata.name(character) # type: str
  18. except ValueError:
  19. return False
  20. return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description
  21. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  22. def remove_accent(character: str) -> str:
  23. decomposed = unicodedata.decomposition(character) # type: str
  24. if not decomposed:
  25. return character
  26. codes = decomposed.split(" ") # type: List[str]
  27. return chr(
  28. int(
  29. codes[0],
  30. 16
  31. )
  32. )
  33. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  34. def unicode_range(character: str) -> Optional[str]:
  35. """
  36. Retrieve the Unicode range official name from a single character.
  37. """
  38. character_ord = ord(character) # type: int
  39. for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
  40. if character_ord in ord_range:
  41. return range_name
  42. return None
  43. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  44. def is_latin(character: str) -> bool:
  45. try:
  46. description = unicodedata.name(character) # type: str
  47. except ValueError:
  48. return False
  49. return "LATIN" in description
  50. def is_ascii(character: str) -> bool:
  51. try:
  52. character.encode("ascii")
  53. except UnicodeEncodeError:
  54. return False
  55. return True
  56. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  57. def is_punctuation(character: str) -> bool:
  58. character_category = unicodedata.category(character) # type: str
  59. if "P" in character_category:
  60. return True
  61. character_range = unicode_range(character) # type: Optional[str]
  62. if character_range is None:
  63. return False
  64. return "Punctuation" in character_range
  65. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  66. def is_symbol(character: str) -> bool:
  67. character_category = unicodedata.category(character) # type: str
  68. if "S" in character_category or "N" in character_category:
  69. return True
  70. character_range = unicode_range(character) # type: Optional[str]
  71. if character_range is None:
  72. return False
  73. return "Forms" in character_range
  74. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  75. def is_separator(character: str) -> bool:
  76. if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
  77. return True
  78. character_category = unicodedata.category(character) # type: str
  79. return "Z" in character_category
  80. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  81. def is_case_variable(character: str) -> bool:
  82. return character.islower() != character.isupper()
  83. def is_private_use_only(character: str) -> bool:
  84. character_category = unicodedata.category(character) # type: str
  85. return "Co" == character_category
  86. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  87. def is_cjk(character: str) -> bool:
  88. try:
  89. character_name = unicodedata.name(character)
  90. except ValueError:
  91. return False
  92. return "CJK" in character_name
  93. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  94. def is_hiragana(character: str) -> bool:
  95. try:
  96. character_name = unicodedata.name(character)
  97. except ValueError:
  98. return False
  99. return "HIRAGANA" in character_name
  100. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  101. def is_katakana(character: str) -> bool:
  102. try:
  103. character_name = unicodedata.name(character)
  104. except ValueError:
  105. return False
  106. return "KATAKANA" in character_name
  107. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  108. def is_hangul(character: str) -> bool:
  109. try:
  110. character_name = unicodedata.name(character)
  111. except ValueError:
  112. return False
  113. return "HANGUL" in character_name
  114. @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  115. def is_thai(character: str) -> bool:
  116. try:
  117. character_name = unicodedata.name(character)
  118. except ValueError:
  119. return False
  120. return "THAI" in character_name
  121. @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
  122. def is_unicode_range_secondary(range_name: str) -> bool:
  123. for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
  124. if keyword in range_name:
  125. return True
  126. return False
  127. def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
  128. """
  129. Extract using ASCII-only decoder any specified encoding in the first n-bytes.
  130. """
  131. if not isinstance(sequence, bytes):
  132. raise TypeError
  133. seq_len = len(sequence) # type: int
  134. results = findall(
  135. RE_POSSIBLE_ENCODING_INDICATION,
  136. sequence[:seq_len if seq_len <= search_zone else search_zone].decode('ascii', errors='ignore')
  137. ) # type: List[str]
  138. if len(results) == 0:
  139. return None
  140. for specified_encoding in results:
  141. specified_encoding = specified_encoding.lower().replace('-', '_')
  142. for encoding_alias, encoding_iana in aliases.items():
  143. if encoding_alias == specified_encoding:
  144. return encoding_iana
  145. if encoding_iana == specified_encoding:
  146. return encoding_iana
  147. return None
  148. @lru_cache(maxsize=128)
  149. def is_multi_byte_encoding(name: str) -> bool:
  150. """
  151. Verify is a specific encoding is a multi byte one based on it IANA name
  152. """
  153. return name in {"utf_8", "utf_8_sig", "utf_16", "utf_16_be", "utf_16_le", "utf_32", "utf_32_le", "utf_32_be", "utf_7"} or issubclass(
  154. importlib.import_module('encodings.{}'.format(name)).IncrementalDecoder, # type: ignore
  155. MultibyteIncrementalDecoder
  156. )
  157. def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
  158. """
  159. Identify and extract SIG/BOM in given sequence.
  160. """
  161. for iana_encoding in ENCODING_MARKS:
  162. marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
  163. if isinstance(marks, bytes):
  164. marks = [marks]
  165. for mark in marks:
  166. if sequence.startswith(mark):
  167. return iana_encoding, mark
  168. return None, b""
  169. def should_strip_sig_or_bom(iana_encoding: str) -> bool:
  170. return iana_encoding not in {"utf_16", "utf_32"}
  171. def iana_name(cp_name: str, strict: bool = True) -> str:
  172. cp_name = cp_name.lower().replace('-', '_')
  173. for encoding_alias, encoding_iana in aliases.items():
  174. if cp_name == encoding_alias or cp_name == encoding_iana:
  175. return encoding_iana
  176. if strict:
  177. raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
  178. return cp_name
  179. def range_scan(decoded_sequence: str) -> List[str]:
  180. ranges = set() # type: Set[str]
  181. for character in decoded_sequence:
  182. character_range = unicode_range(character) # type: Optional[str]
  183. if character_range is None:
  184. continue
  185. ranges.add(
  186. character_range
  187. )
  188. return list(ranges)
  189. def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
  190. if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
  191. return 0.
  192. decoder_a = importlib.import_module('encodings.{}'.format(iana_name_a)).IncrementalDecoder # type: ignore
  193. decoder_b = importlib.import_module('encodings.{}'.format(iana_name_b)).IncrementalDecoder # type: ignore
  194. id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
  195. id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
  196. character_match_count = 0 # type: int
  197. for i in range(0, 255):
  198. to_be_decoded = bytes([i]) # type: bytes
  199. if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
  200. character_match_count += 1
  201. return character_match_count / 254
  202. def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
  203. """
  204. Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
  205. the function cp_similarity.
  206. """
  207. return iana_name_a in IANA_SUPPORTED_SIMILAR and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]