# md.py

from functools import lru_cache
from typing import Optional, List

from charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD
from charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \
    remove_accent, is_separator, is_cjk, is_case_variable, is_hangul, is_katakana, is_hiragana, is_ascii, is_thai

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed into this detector.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon each eligible character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError  # pragma: nocover

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover

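# Illustrative sketch, kept in comments so that it is NOT picked up by
# MessDetectorPlugin.__subclasses__() inside mess_ratio() below. A hypothetical
# detector flagging digit-heavy content would follow the same contract:
#
#   class MostlyDigitPlugin(MessDetectorPlugin):
#       def __init__(self):
#           self._character_count = 0  # type: int
#           self._digit_count = 0  # type: int
#
#       def eligible(self, character: str) -> bool:
#           return character.isprintable()
#
#       def feed(self, character: str) -> None:
#           self._character_count += 1
#           if character.isdigit():
#               self._digit_count += 1
#
#       def reset(self) -> None:
#           self._character_count = 0
#           self._digit_count = 0
#
#       @property
#       def ratio(self) -> float:
#           if self._character_count == 0:
#               return 0.
#           return self._digit_count / self._character_count
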
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self):
        self._punctuation_count = 0  # type: int
        self._symbol_count = 0  # type: int
        self._character_count = 0  # type: int

        self._last_printable_char = None  # type: Optional[str]
        self._frenzy_symbol_in_word = False  # type: bool

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if character != self._last_printable_char \
                and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]", ",", "|", '"']:
            if is_punctuation(character):
                self._punctuation_count += 1
            elif character.isdigit() is False and is_symbol(character):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        ratio_of_punctuation = (self._punctuation_count + self._symbol_count) / self._character_count  # type: float

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self):
        self._character_count = 0  # type: int
        self._accentuated_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        ratio_of_accentuation = self._accentuated_count / self._character_count  # type: float

        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.

class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self):
        self._unprintable_count = 0  # type: int
        self._character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character not in {'\n', '\t', '\r', '\v'} and character.isprintable() is False:
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        return (self._unprintable_count * 8) / self._character_count

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self):
        self._successive_count = 0  # type: int
        self._character_count = 0  # type: int

        self._last_latin_character = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if self._last_latin_character is not None:
            if is_accentuated(character) and is_accentuated(self._last_latin_character):
                if character.isupper() and self._last_latin_character.isupper():
                    self._successive_count += 1
                # Worse if it's the same character duplicated with a different accent.
                if remove_accent(character) == remove_accent(self._last_latin_character):
                    self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        return (self._successive_count * 2) / self._character_count

class SuspiciousRange(MessDetectorPlugin):
    def __init__(self):
        self._suspicious_successive_range_count = 0  # type: int
        self._character_count = 0  # type: int
        self._last_printable_seen = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if character.isspace() or is_punctuation(character):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a = unicode_range(self._last_printable_seen)  # type: Optional[str]
        unicode_range_b = unicode_range(character)  # type: Optional[str]

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        ratio_of_suspicious_range_usage = (self._suspicious_successive_range_count * 2) / self._character_count  # type: float

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.

        return ratio_of_suspicious_range_usage

class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self):
        self._word_count = 0  # type: int
        self._bad_word_count = 0  # type: int
        self._is_current_word_bad = False  # type: bool
        self._foreign_long_watch = False  # type: bool

        self._character_count = 0  # type: int
        self._bad_character_count = 0  # type: int

        self._buffer = ""  # type: str
        self._buffer_accent_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer = "".join([self._buffer, character])
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if self._foreign_long_watch is False and is_latin(character) is False \
                    and is_cjk(character) is False and is_hangul(character) is False \
                    and is_katakana(character) is False and is_hiragana(character) is False \
                    and is_thai(character) is False:
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (character.isspace() or is_punctuation(character) or is_separator(character)) and self._buffer:
            self._word_count += 1
            buffer_length = len(self._buffer)  # type: int

            self._character_count += buffer_length

            if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
                self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10:
            return 0.

        return self._bad_character_count / self._character_count

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the content does not fit,
    which can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self):
        self._wrong_stop_count = 0  # type: int
        self._cjk_character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in ["丅", "丄"]:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.

        return self._wrong_stop_count / self._cjk_character_count

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self):
        self._buf = False  # type: bool

        self._character_count_since_last_sep = 0  # type: int

        self._successive_upper_lower_count = 0  # type: int
        self._successive_upper_lower_count_final = 0  # type: int

        self._character_count = 0  # type: int

        self._last_alpha_seen = None  # type: Optional[str]
        self._current_ascii_only = True  # type: bool

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if self._character_count_since_last_sep <= 64 \
                    and character.isdigit() is False \
                    and self._current_ascii_only is False:
                self._successive_upper_lower_count_final += self._successive_upper_lower_count

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.

        return self._successive_upper_lower_count_final / self._character_count

def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered as suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(" "), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese Exception
    if unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']:
        return False

    if unicode_range_a in ['Katakana', 'Hiragana'] or unicode_range_b in ['Katakana', 'Hiragana']:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ('CJK' in unicode_range_a or 'CJK' in unicode_range_b) or \
            (unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']):
        if 'Punctuation' in unicode_range_a or 'Punctuation' in unicode_range_b:
            return False
        if 'Forms' in unicode_range_a or 'Forms' in unicode_range_b:
            return False

    return True

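# Illustrative examples (assuming unicode_range() returns Unicode block names
# such as "Basic Latin", "Cyrillic" or "Latin-1 Supplement"):
#
#   is_suspiciously_successive_range("Basic Latin", "Cyrillic")            # True
#   is_suspiciously_successive_range("Basic Latin", "Latin-1 Supplement")  # False, both contain "Latin"
#   is_suspiciously_successive_range(None, "Basic Latin")                  # True, unknown range
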
@lru_cache(maxsize=2048)
def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """
    detectors = []  # type: List[MessDetectorPlugin]

    for md_class in MessDetectorPlugin.__subclasses__():
        detectors.append(md_class())

    length = len(decoded_sequence)  # type: int

    mean_mess_ratio = 0.  # type: float

    if length < 512:
        intermediary_mean_mess_ratio_calc = 32  # type: int
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence, range(0, length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (index > 0 and index % intermediary_mean_mess_ratio_calc == 0) or index == length - 1:
            mean_mess_ratio = sum([dt.ratio for dt in detectors])

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)
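

# Minimal usage sketch (illustration only, not part of the original module):
# mess_ratio() expects an already-decoded str. Decoding the same bytes with the
# wrong codec typically yields a noticeably higher ratio than the correct one.
if __name__ == "__main__":  # pragma: nocover
    _payload = "Le cœur a ses raisons que la raison ne connaît point.".encode("utf_8")

    print(mess_ratio(_payload.decode("utf_8")))    # correct codec
    print(mess_ratio(_payload.decode("latin_1")))  # mojibake from the wrong codec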