common.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. """
  2. @author: luojunhui
  3. """
  4. import random
  5. import string
  6. import hashlib
  7. from datetime import datetime, timezone, date, timedelta
  8. from requests import RequestException
  9. from urllib.parse import urlparse, parse_qs
  10. from tenacity import (
  11. stop_after_attempt,
  12. wait_exponential,
  13. retry_if_exception_type,
  14. )
  15. def str_to_md5(strings):
  16. """
  17. 字符串转化为 md5 值
  18. :param strings:
  19. :return:
  20. """
  21. # 将字符串转换为字节
  22. original_bytes = strings.encode("utf-8")
  23. # 创建一个md5 hash对象
  24. md5_hash = hashlib.md5()
  25. # 更新hash对象,传入原始字节
  26. md5_hash.update(original_bytes)
  27. # 获取16进制形式的MD5哈希值
  28. md5_value = md5_hash.hexdigest()
  29. return md5_value
  30. def proxy():
  31. """
  32. 快代理
  33. """
  34. # 隧道域名:端口号
  35. tunnel = "j685.kdltps.com:15818"
  36. # 用户名密码方式
  37. username = "t14070979713487"
  38. password = "hqwanfvy"
  39. proxies = {
  40. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  41. % {"user": username, "pwd": password, "proxy": tunnel},
  42. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  43. % {"user": username, "pwd": password, "proxy": tunnel},
  44. }
  45. return proxies
  46. def async_proxy():
  47. return {
  48. "url": "http://j685.kdltps.com:15818",
  49. "username": "t14070979713487",
  50. "password": "hqwanfvy",
  51. }
  52. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  53. """
  54. :param retry_times:
  55. :param min_retry_delay:
  56. :param max_retry_delay:
  57. """
  58. common_retry = dict(
  59. stop=stop_after_attempt(retry_times),
  60. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  61. retry=retry_if_exception_type((RequestException, TimeoutError)),
  62. reraise=True, # 重试耗尽后重新抛出异常
  63. )
  64. return common_retry
  65. def yield_batch(data, batch_size):
  66. """
  67. 生成批次数据
  68. :param data:
  69. :param batch_size:
  70. :return:
  71. """
  72. for i in range(0, len(data), batch_size):
  73. yield data[i : i + batch_size]
  74. def extract_root_source_id(path: str) -> dict:
  75. """
  76. 提取path参数
  77. :param path:
  78. :return:
  79. """
  80. params = parse_qs(urlparse(path).query)
  81. jump_page = params.get("jumpPage", [None])[0]
  82. if jump_page:
  83. params2 = parse_qs(jump_page)
  84. res = {
  85. "video_id": params2["pages/user-videos?id"][0],
  86. "root_source_id": params2["rootSourceId"][0],
  87. }
  88. return res
  89. else:
  90. return {}
  91. def show_desc_to_sta(show_desc):
  92. def decode_show_v(show_v):
  93. """
  94. :param show_v:
  95. :return:
  96. """
  97. foo = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
  98. foo = eval(foo)
  99. return int(foo)
  100. def decode_show_k(show_k):
  101. """
  102. :param show_k:
  103. :return:
  104. """
  105. this_dict = {
  106. "阅读": "show_view_count", # 文章
  107. "看过": "show_view_count", # 图文
  108. "观看": "show_view_count", # 视频
  109. "赞": "show_like_count",
  110. "付费": "show_pay_count",
  111. "赞赏": "show_zs_count",
  112. }
  113. if show_k not in this_dict:
  114. print(f"error from decode_show_k, show_k not found: {show_k}")
  115. return this_dict.get(show_k, "show_unknown")
  116. show_desc = show_desc.replace("+", "")
  117. sta = {}
  118. for show_kv in show_desc.split("\u2004\u2005"):
  119. if not show_kv:
  120. continue
  121. show_k, show_v = show_kv.split("\u2006")
  122. k = decode_show_k(show_k)
  123. v = decode_show_v(show_v)
  124. sta[k] = v
  125. res = {
  126. "show_view_count": sta.get("show_view_count", 0),
  127. "show_like_count": sta.get("show_like_count", 0),
  128. "show_pay_count": sta.get("show_pay_count", 0),
  129. "show_zs_count": sta.get("show_zs_count", 0),
  130. }
  131. return res
  132. def generate_gzh_id(url):
  133. biz = url.split("biz=")[1].split("&")[0]
  134. idx = url.split("&idx=")[1].split("&")[0]
  135. sn = url.split("&sn=")[1].split("&")[0]
  136. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  137. md5_hash = hashlib.md5()
  138. md5_hash.update(url_bit)
  139. md5_value = md5_hash.hexdigest()
  140. return md5_value
  141. def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
  142. """
  143. :param string_format:
  144. :param timestamp:
  145. """
  146. dt_object = (
  147. datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  148. )
  149. date_string = dt_object.strftime(string_format)
  150. return date_string
  151. def days_remaining_in_month():
  152. # 获取当前日期
  153. today = date.today()
  154. # 获取下个月的第一天
  155. if today.month == 12:
  156. next_month = today.replace(year=today.year + 1, month=1, day=1)
  157. else:
  158. next_month = today.replace(month=today.month + 1, day=1)
  159. # 计算本月最后一天(下个月第一天减去1天)
  160. last_day_of_month = next_month - timedelta(days=1)
  161. # 计算剩余天数
  162. remaining_days = (last_day_of_month - today).days
  163. return remaining_days
  164. def generate_task_trace_id():
  165. random_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=16))
  166. return f"Task-{datetime.now().strftime('%Y%m%d%H%M%S')}-{random_str}"