common.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. """
  2. @author: luojunhui
  3. """
  4. import hashlib
  5. import json
  6. import re
  7. from typing import Dict, List, Optional
  8. from datetime import datetime, timezone, date, timedelta
  9. from requests import RequestException
  10. from urllib.parse import urlparse, parse_qs
  11. from tenacity import (
  12. stop_after_attempt,
  13. wait_exponential,
  14. retry_if_exception_type,
  15. )
  16. def str_to_md5(strings):
  17. """
  18. 字符串转化为 md5 值
  19. :param strings:
  20. :return:
  21. """
  22. # 将字符串转换为字节
  23. original_bytes = strings.encode("utf-8")
  24. # 创建一个md5 hash对象
  25. md5_hash = hashlib.md5()
  26. # 更新hash对象,传入原始字节
  27. md5_hash.update(original_bytes)
  28. # 获取16进制形式的MD5哈希值
  29. md5_value = md5_hash.hexdigest()
  30. return md5_value
  31. def proxy():
  32. """
  33. 快代理
  34. """
  35. # 隧道域名:端口号
  36. tunnel = "j685.kdltps.com:15818"
  37. # 用户名密码方式
  38. username = "t16899444538299"
  39. password = "5w5ersso"
  40. proxies = {
  41. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  42. % {"user": username, "pwd": password, "proxy": tunnel},
  43. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  44. % {"user": username, "pwd": password, "proxy": tunnel},
  45. }
  46. return proxies
  47. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  48. """
  49. :param retry_times:
  50. :param min_retry_delay:
  51. :param max_retry_delay:
  52. """
  53. common_retry = dict(
  54. stop=stop_after_attempt(retry_times),
  55. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  56. retry=retry_if_exception_type((RequestException, TimeoutError)),
  57. reraise=True, # 重试耗尽后重新抛出异常
  58. )
  59. return common_retry
  60. def yield_batch(data, batch_size):
  61. """
  62. 生成批次数据
  63. :param data:
  64. :param batch_size:
  65. :return:
  66. """
  67. for i in range(0, len(data), batch_size):
  68. yield data[i : i + batch_size]
  69. def extract_root_source_id(path: str) -> dict:
  70. """
  71. 提取path参数
  72. :param path:
  73. :return:
  74. """
  75. params = parse_qs(urlparse(path).query)
  76. jump_page = params.get("jumpPage", [None])[0]
  77. if jump_page:
  78. params2 = parse_qs(jump_page)
  79. res = {
  80. "video_id": params2["pages/user-videos?id"][0],
  81. "root_source_id": params2["rootSourceId"][0],
  82. }
  83. return res
  84. else:
  85. return {}
  86. def show_desc_to_sta(show_desc):
  87. def decode_show_v(show_v):
  88. """
  89. :param show_v:
  90. :return:
  91. """
  92. foo = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
  93. foo = eval(foo)
  94. return int(foo)
  95. def decode_show_k(show_k):
  96. """
  97. :param show_k:
  98. :return:
  99. """
  100. this_dict = {
  101. "阅读": "show_view_count", # 文章
  102. "看过": "show_view_count", # 图文
  103. "观看": "show_view_count", # 视频
  104. "赞": "show_like_count",
  105. "付费": "show_pay_count",
  106. "赞赏": "show_zs_count",
  107. }
  108. if show_k not in this_dict:
  109. print(f"error from decode_show_k, show_k not found: {show_k}")
  110. return this_dict.get(show_k, "show_unknown")
  111. show_desc = show_desc.replace("+", "")
  112. sta = {}
  113. for show_kv in show_desc.split("\u2004\u2005"):
  114. if not show_kv:
  115. continue
  116. show_k, show_v = show_kv.split("\u2006")
  117. k = decode_show_k(show_k)
  118. v = decode_show_v(show_v)
  119. sta[k] = v
  120. res = {
  121. "show_view_count": sta.get("show_view_count", 0),
  122. "show_like_count": sta.get("show_like_count", 0),
  123. "show_pay_count": sta.get("show_pay_count", 0),
  124. "show_zs_count": sta.get("show_zs_count", 0),
  125. }
  126. return res
  127. def generate_gzh_id(url):
  128. biz = url.split("biz=")[1].split("&")[0]
  129. idx = url.split("&idx=")[1].split("&")[0]
  130. sn = url.split("&sn=")[1].split("&")[0]
  131. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  132. md5_hash = hashlib.md5()
  133. md5_hash.update(url_bit)
  134. md5_value = md5_hash.hexdigest()
  135. return md5_value
  136. def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
  137. """
  138. :param string_format:
  139. :param timestamp:
  140. """
  141. dt_object = (
  142. datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  143. )
  144. date_string = dt_object.strftime(string_format)
  145. return date_string
  146. def days_remaining_in_month():
  147. # 获取当前日期
  148. today = date.today()
  149. # 获取下个月的第一天
  150. if today.month == 12:
  151. next_month = today.replace(year=today.year + 1, month=1, day=1)
  152. else:
  153. next_month = today.replace(month=today.month + 1, day=1)
  154. # 计算本月最后一天(下个月第一天减去1天)
  155. last_day_of_month = next_month - timedelta(days=1)
  156. # 计算剩余天数
  157. remaining_days = (last_day_of_month - today).days
  158. return remaining_days
  159. def safe_json_parse(text: str) -> Optional[Dict | List]:
  160. """多层降级解析 JSON:直接解析 → 提取代码块 → 提取 JSON 对象/数组
  161. 模型有时返回 ```json ... ``` 包裹的文本,或文本中夹杂 markdown 前缀/后缀。
  162. 先尝试直接解析(最常见路径),失败后逐层降级提取。
  163. """
  164. if not text:
  165. return None
  166. # 降级 1:直接解析
  167. try:
  168. return json.loads(text)
  169. except (json.JSONDecodeError, TypeError):
  170. pass
  171. clean = text.strip()
  172. # 降级 2:提取最外层 json 代码块 ```json ... ```
  173. # 优先匹配带语言标注的,再退到任意 code fence
  174. m = re.search(r"```json\s*(.*?)\s*```", clean, re.DOTALL)
  175. if m:
  176. try:
  177. return json.loads(m.group(1))
  178. except (json.JSONDecodeError, TypeError):
  179. pass
  180. else:
  181. m = re.search(r"```\s*(.*?)\s*```", clean, re.DOTALL)
  182. if m:
  183. try:
  184. return json.loads(m.group(1))
  185. except (json.JSONDecodeError, TypeError):
  186. pass
  187. # 降级 3:在文本中查找第一个完整 JSON 对象 { ... } 或数组 [ ... ]
  188. # 逐字符扫描,维护字符串状态机,正确处理内嵌括号和转义引号
  189. for bracket_pair in [("{}", "{", "}"), ("[]", "[", "]")]:
  190. opener, closer = bracket_pair[1], bracket_pair[2]
  191. start = clean.find(opener)
  192. if start == -1:
  193. continue
  194. depth = 0
  195. in_string = False
  196. escape_next = False
  197. for i in range(start, len(clean)):
  198. ch = clean[i]
  199. if escape_next:
  200. escape_next = False
  201. continue
  202. if ch == "\\":
  203. escape_next = True
  204. continue
  205. if ch == '"' and not escape_next:
  206. in_string = not in_string
  207. continue
  208. if in_string:
  209. continue
  210. if ch == opener:
  211. depth += 1
  212. elif ch == closer:
  213. depth -= 1
  214. if depth == 0:
  215. try:
  216. return json.loads(clean[start : i + 1])
  217. except (json.JSONDecodeError, TypeError):
  218. return None
  219. # 数组或对象未闭合时也尝试下
  220. try:
  221. return json.loads(clean[start:])
  222. except (json.JSONDecodeError, TypeError):
  223. pass
  224. return None