common.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. """
  2. @author: luojunhui
  3. """
  4. import random
  5. import string
  6. import hashlib
  7. import math
  8. import statistics
  9. from scipy.stats import t
  10. from datetime import datetime, timezone, date, timedelta
  11. from typing import List
  12. from requests import RequestException
  13. from urllib.parse import urlparse, parse_qs
  14. from tenacity import (
  15. stop_after_attempt,
  16. wait_exponential,
  17. retry_if_exception_type,
  18. )
  19. def str_to_md5(strings):
  20. """
  21. 字符串转化为 md5 值
  22. :param strings:
  23. :return:
  24. """
  25. # 将字符串转换为字节
  26. original_bytes = strings.encode("utf-8")
  27. # 创建一个md5 hash对象
  28. md5_hash = hashlib.md5()
  29. # 更新hash对象,传入原始字节
  30. md5_hash.update(original_bytes)
  31. # 获取16进制形式的MD5哈希值
  32. md5_value = md5_hash.hexdigest()
  33. return md5_value
  34. def proxy():
  35. """
  36. 快代理
  37. """
  38. # 隧道域名:端口号
  39. tunnel = "j685.kdltps.com:15818"
  40. # 用户名密码方式
  41. username = "t14070979713487"
  42. password = "hqwanfvy"
  43. proxies = {
  44. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  45. % {"user": username, "pwd": password, "proxy": tunnel},
  46. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  47. % {"user": username, "pwd": password, "proxy": tunnel},
  48. }
  49. return proxies
  50. def async_proxy():
  51. return {
  52. "url": "http://j685.kdltps.com:15818",
  53. "username": "t14070979713487",
  54. "password": "hqwanfvy",
  55. }
  56. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  57. """
  58. :param retry_times:
  59. :param min_retry_delay:
  60. :param max_retry_delay:
  61. """
  62. common_retry = dict(
  63. stop=stop_after_attempt(retry_times),
  64. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  65. retry=retry_if_exception_type((RequestException, TimeoutError)),
  66. reraise=True, # 重试耗尽后重新抛出异常
  67. )
  68. return common_retry
  69. def yield_batch(data, batch_size):
  70. """
  71. 生成批次数据
  72. :param data:
  73. :param batch_size:
  74. :return:
  75. """
  76. for i in range(0, len(data), batch_size):
  77. yield data[i : i + batch_size]
  78. def extract_root_source_id(path: str) -> dict:
  79. """
  80. 提取path参数
  81. :param path:
  82. :return:
  83. """
  84. params = parse_qs(urlparse(path).query)
  85. jump_page = params.get("jumpPage", [None])[0]
  86. if jump_page:
  87. params2 = parse_qs(jump_page)
  88. res = {
  89. "video_id": params2["pages/user-videos?id"][0],
  90. "root_source_id": params2["rootSourceId"][0],
  91. }
  92. return res
  93. else:
  94. return {}
  95. def show_desc_to_sta(show_desc):
  96. def decode_show_v(show_v):
  97. """
  98. :param show_v:
  99. :return:
  100. """
  101. foo = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
  102. foo = eval(foo)
  103. return int(foo)
  104. def decode_show_k(show_k):
  105. """
  106. :param show_k:
  107. :return:
  108. """
  109. this_dict = {
  110. "阅读": "show_view_count", # 文章
  111. "看过": "show_view_count", # 图文
  112. "观看": "show_view_count", # 视频
  113. "赞": "show_like_count",
  114. "付费": "show_pay_count",
  115. "赞赏": "show_zs_count",
  116. }
  117. if show_k not in this_dict:
  118. print(f"error from decode_show_k, show_k not found: {show_k}")
  119. return this_dict.get(show_k, "show_unknown")
  120. show_desc = show_desc.replace("+", "")
  121. sta = {}
  122. for show_kv in show_desc.split("\u2004\u2005"):
  123. if not show_kv:
  124. continue
  125. show_k, show_v = show_kv.split("\u2006")
  126. k = decode_show_k(show_k)
  127. v = decode_show_v(show_v)
  128. sta[k] = v
  129. res = {
  130. "show_view_count": sta.get("show_view_count", 0),
  131. "show_like_count": sta.get("show_like_count", 0),
  132. "show_pay_count": sta.get("show_pay_count", 0),
  133. "show_zs_count": sta.get("show_zs_count", 0),
  134. }
  135. return res
  136. def generate_gzh_id(url):
  137. biz = url.split("biz=")[1].split("&")[0]
  138. idx = url.split("&idx=")[1].split("&")[0]
  139. sn = url.split("&sn=")[1].split("&")[0]
  140. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  141. md5_hash = hashlib.md5()
  142. md5_hash.update(url_bit)
  143. md5_value = md5_hash.hexdigest()
  144. return md5_value
  145. def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
  146. """
  147. :param string_format:
  148. :param timestamp:
  149. """
  150. dt_object = (
  151. datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  152. )
  153. date_string = dt_object.strftime(string_format)
  154. return date_string
  155. def days_remaining_in_month():
  156. # 获取当前日期
  157. today = date.today()
  158. # 获取下个月的第一天
  159. if today.month == 12:
  160. next_month = today.replace(year=today.year + 1, month=1, day=1)
  161. else:
  162. next_month = today.replace(month=today.month + 1, day=1)
  163. # 计算本月最后一天(下个月第一天减去1天)
  164. last_day_of_month = next_month - timedelta(days=1)
  165. # 计算剩余天数
  166. remaining_days = (last_day_of_month - today).days
  167. return remaining_days
  168. def generate_task_trace_id():
  169. random_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=16))
  170. return f"Task-{datetime.now().strftime('%Y%m%d%H%M%S')}-{random_str}"
  171. def ci_lower(data: List[int], conf: float = 0.95) -> float:
  172. """
  173. 计算data的置信区间下限
  174. """
  175. if len(data) < 2:
  176. raise ValueError("Sample length less than 2")
  177. n = len(data)
  178. mean = statistics.mean(data)
  179. std = statistics.stdev(data) / math.sqrt(n)
  180. # t 分位点(左侧):ppf 返回负值
  181. t_left = t.ppf((1 - conf) / 2, df=n - 1)
  182. return mean + t_left * std