common.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. """
  2. @author: luojunhui
  3. """
  4. import hashlib
  5. from datetime import datetime, timezone, date, timedelta
  6. from requests import RequestException
  7. from urllib.parse import urlparse, parse_qs
  8. from tenacity import (
  9. stop_after_attempt,
  10. wait_exponential,
  11. retry_if_exception_type,
  12. )
  13. def str_to_md5(strings):
  14. """
  15. 字符串转化为 md5 值
  16. :param strings:
  17. :return:
  18. """
  19. # 将字符串转换为字节
  20. original_bytes = strings.encode("utf-8")
  21. # 创建一个md5 hash对象
  22. md5_hash = hashlib.md5()
  23. # 更新hash对象,传入原始字节
  24. md5_hash.update(original_bytes)
  25. # 获取16进制形式的MD5哈希值
  26. md5_value = md5_hash.hexdigest()
  27. return md5_value
  28. def proxy():
  29. """
  30. 快代理
  31. """
  32. # 隧道域名:端口号
  33. tunnel = "j685.kdltps.com:15818"
  34. # 用户名密码方式
  35. username = "t14070979713487"
  36. password = "hqwanfvy"
  37. proxies = {
  38. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  39. % {"user": username, "pwd": password, "proxy": tunnel},
  40. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  41. % {"user": username, "pwd": password, "proxy": tunnel},
  42. }
  43. return proxies
  44. def async_proxy():
  45. return {
  46. "url": "http://j685.kdltps.com:15818",
  47. "username": "t14070979713487",
  48. "password": "hqwanfvy",
  49. }
  50. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  51. """
  52. :param retry_times:
  53. :param min_retry_delay:
  54. :param max_retry_delay:
  55. """
  56. common_retry = dict(
  57. stop=stop_after_attempt(retry_times),
  58. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  59. retry=retry_if_exception_type((RequestException, TimeoutError)),
  60. reraise=True, # 重试耗尽后重新抛出异常
  61. )
  62. return common_retry
  63. def yield_batch(data, batch_size):
  64. """
  65. 生成批次数据
  66. :param data:
  67. :param batch_size:
  68. :return:
  69. """
  70. for i in range(0, len(data), batch_size):
  71. yield data[i : i + batch_size]
  72. def extract_root_source_id(path: str) -> dict:
  73. """
  74. 提取path参数
  75. :param path:
  76. :return:
  77. """
  78. params = parse_qs(urlparse(path).query)
  79. jump_page = params.get("jumpPage", [None])[0]
  80. if jump_page:
  81. params2 = parse_qs(jump_page)
  82. res = {
  83. "video_id": params2["pages/user-videos?id"][0],
  84. "root_source_id": params2["rootSourceId"][0],
  85. }
  86. return res
  87. else:
  88. return {}
  89. def show_desc_to_sta(show_desc):
  90. def decode_show_v(show_v):
  91. """
  92. :param show_v:
  93. :return:
  94. """
  95. foo = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
  96. foo = eval(foo)
  97. return int(foo)
  98. def decode_show_k(show_k):
  99. """
  100. :param show_k:
  101. :return:
  102. """
  103. this_dict = {
  104. "阅读": "show_view_count", # 文章
  105. "看过": "show_view_count", # 图文
  106. "观看": "show_view_count", # 视频
  107. "赞": "show_like_count",
  108. "付费": "show_pay_count",
  109. "赞赏": "show_zs_count",
  110. }
  111. if show_k not in this_dict:
  112. print(f"error from decode_show_k, show_k not found: {show_k}")
  113. return this_dict.get(show_k, "show_unknown")
  114. show_desc = show_desc.replace("+", "")
  115. sta = {}
  116. for show_kv in show_desc.split("\u2004\u2005"):
  117. if not show_kv:
  118. continue
  119. show_k, show_v = show_kv.split("\u2006")
  120. k = decode_show_k(show_k)
  121. v = decode_show_v(show_v)
  122. sta[k] = v
  123. res = {
  124. "show_view_count": sta.get("show_view_count", 0),
  125. "show_like_count": sta.get("show_like_count", 0),
  126. "show_pay_count": sta.get("show_pay_count", 0),
  127. "show_zs_count": sta.get("show_zs_count", 0),
  128. }
  129. return res
  130. def generate_gzh_id(url):
  131. biz = url.split("biz=")[1].split("&")[0]
  132. idx = url.split("&idx=")[1].split("&")[0]
  133. sn = url.split("&sn=")[1].split("&")[0]
  134. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  135. md5_hash = hashlib.md5()
  136. md5_hash.update(url_bit)
  137. md5_value = md5_hash.hexdigest()
  138. return md5_value
  139. def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
  140. """
  141. :param string_format:
  142. :param timestamp:
  143. """
  144. dt_object = (
  145. datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  146. )
  147. date_string = dt_object.strftime(string_format)
  148. return date_string
  149. def days_remaining_in_month():
  150. # 获取当前日期
  151. today = date.today()
  152. # 获取下个月的第一天
  153. if today.month == 12:
  154. next_month = today.replace(year=today.year + 1, month=1, day=1)
  155. else:
  156. next_month = today.replace(month=today.month + 1, day=1)
  157. # 计算本月最后一天(下个月第一天减去1天)
  158. last_day_of_month = next_month - timedelta(days=1)
  159. # 计算剩余天数
  160. remaining_days = (last_day_of_month - today).days
  161. return remaining_days