tools.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. """
  2. @author: luojunhui
  3. """
  4. import re
  5. import oss2
  6. import random
  7. import string
  8. import hashlib
  9. import math
  10. import statistics
  11. from scipy.stats import t
  12. from odps import ODPS
  13. from datetime import datetime, timezone, date, timedelta
  14. from typing import List
  15. from requests import RequestException
  16. from urllib.parse import urlparse, parse_qs
  17. from tenacity import (
  18. stop_after_attempt,
  19. wait_exponential,
  20. retry_if_exception_type,
  21. )
  22. def str_to_md5(strings):
  23. """
  24. 字符串转化为 md5 值
  25. :param strings:
  26. :return:
  27. """
  28. # 将字符串转换为字节
  29. original_bytes = strings.encode("utf-8")
  30. # 创建一个md5 hash对象
  31. md5_hash = hashlib.md5()
  32. # 更新hash对象,传入原始字节
  33. md5_hash.update(original_bytes)
  34. # 获取16进制形式的MD5哈希值
  35. md5_value = md5_hash.hexdigest()
  36. return md5_value
  37. def proxy():
  38. """
  39. 快代理
  40. """
  41. # 隧道域名:端口号
  42. tunnel = "j685.kdltps.com:15818"
  43. # 用户名密码方式
  44. username = "t14070979713487"
  45. password = "hqwanfvy"
  46. proxies = {
  47. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  48. % {"user": username, "pwd": password, "proxy": tunnel},
  49. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  50. % {"user": username, "pwd": password, "proxy": tunnel},
  51. }
  52. return proxies
  53. def async_proxy():
  54. return {
  55. "url": "http://j685.kdltps.com:15818",
  56. "username": "t14070979713487",
  57. "password": "hqwanfvy",
  58. }
  59. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  60. """
  61. :param retry_times:
  62. :param min_retry_delay:
  63. :param max_retry_delay:
  64. """
  65. common_retry = dict(
  66. stop=stop_after_attempt(retry_times),
  67. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  68. retry=retry_if_exception_type((RequestException, TimeoutError)),
  69. reraise=True, # 重试耗尽后重新抛出异常
  70. )
  71. return common_retry
  72. def yield_batch(data, batch_size):
  73. """
  74. 生成批次数据
  75. :param data:
  76. :param batch_size:
  77. :return:
  78. """
  79. for i in range(0, len(data), batch_size):
  80. yield data[i : i + batch_size]
  81. def extract_root_source_id(path: str) -> dict:
  82. """
  83. 提取path参数
  84. :param path:
  85. :return:
  86. """
  87. params = parse_qs(urlparse(path).query)
  88. jump_page = params.get("jumpPage", [None])[0]
  89. if jump_page:
  90. params2 = parse_qs(jump_page)
  91. res = {
  92. "video_id": params2["pages/user-videos?id"][0],
  93. "root_source_id": params2["rootSourceId"][0],
  94. }
  95. return res
  96. else:
  97. return {}
  98. def show_desc_to_sta(show_desc: str):
  99. def decode_show_v(show_v: str) -> int:
  100. """
  101. 解析数值(全球通用):
  102. 支持:
  103. - 中文:1.3万 / 2千 / 5亿
  104. - 英文:13k / 2.5m / 1.2b
  105. - 混合:1.2万阅读 / 13k views
  106. """
  107. if not show_v:
  108. return 0
  109. show_v = show_v.strip().lower()
  110. # 防止欧洲小数格式:1,3k
  111. show_v = show_v.replace(",", ".")
  112. # 提取 数字 + 单位
  113. match = re.search(r"(\d+(?:\.\d+)?)([a-z\u4e00-\u9fa5]*)", show_v)
  114. if not match:
  115. return 0
  116. num = float(match.group(1))
  117. unit = match.group(2)
  118. # 中文单位
  119. if "亿" in unit:
  120. num *= 1e8
  121. elif "万" in unit:
  122. num *= 1e4
  123. elif "千" in unit:
  124. num *= 1e3
  125. # 英文单位
  126. elif unit.startswith("k"):
  127. num *= 1e3
  128. elif unit.startswith("m"):
  129. num *= 1e6
  130. elif unit.startswith("b"):
  131. num *= 1e9
  132. return int(num)
  133. def decode_show_k(show_k: str) -> str:
  134. """
  135. 统一 key(中英文)
  136. """
  137. if not show_k:
  138. return "show_unknown"
  139. show_k = show_k.strip().lower()
  140. mapping = {
  141. # 中文
  142. "阅读": "show_view_count",
  143. "看过": "show_view_count",
  144. "观看": "show_view_count",
  145. "赞": "show_like_count",
  146. "点赞": "show_like_count",
  147. "付费": "show_pay_count",
  148. "赞赏": "show_zs_count",
  149. # 英文
  150. "reads": "show_view_count",
  151. "views": "show_view_count",
  152. "view": "show_view_count",
  153. "likes": "show_like_count",
  154. "like": "show_like_count",
  155. "payments": "show_pay_count",
  156. "paid": "show_pay_count",
  157. }
  158. return mapping.get(show_k, "show_unknown")
  159. # ===== 主逻辑 =====
  160. if not show_desc:
  161. return {
  162. "show_view_count": 0,
  163. "show_like_count": 0,
  164. "show_pay_count": 0,
  165. "show_zs_count": 0,
  166. }
  167. # 去掉 +
  168. show_desc = show_desc.replace("+", "")
  169. sta = {}
  170. # 按“组”切分(兼容各种奇怪空格)
  171. groups = re.split(r"[\u2004\u2005]+", show_desc)
  172. for group in groups:
  173. group = group.strip()
  174. if not group:
  175. continue
  176. # 按 key-value 分隔符拆
  177. parts = group.split("\u2006")
  178. if len(parts) != 2:
  179. continue
  180. a, b = parts
  181. # 自动判断哪个是数字
  182. if re.search(r"\d", a):
  183. show_v, show_k = a, b
  184. else:
  185. show_k, show_v = a, b
  186. k = decode_show_k(show_k)
  187. v = decode_show_v(show_v)
  188. if k != "show_unknown":
  189. sta[k] = v
  190. return {
  191. "show_view_count": sta.get("show_view_count", 0),
  192. "show_like_count": sta.get("show_like_count", 0),
  193. "show_pay_count": sta.get("show_pay_count", 0),
  194. "show_zs_count": sta.get("show_zs_count", 0),
  195. }
  196. def generate_gzh_id(url):
  197. biz = url.split("biz=")[1].split("&")[0]
  198. idx = url.split("&idx=")[1].split("&")[0]
  199. sn = url.split("&sn=")[1].split("&")[0]
  200. url_bit = "{}-{}-{}".format(biz, idx, sn).encode()
  201. md5_hash = hashlib.md5()
  202. md5_hash.update(url_bit)
  203. md5_value = md5_hash.hexdigest()
  204. return md5_value
  205. def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
  206. """
  207. :param string_format:
  208. :param timestamp:
  209. """
  210. dt_object = (
  211. datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
  212. )
  213. date_string = dt_object.strftime(string_format)
  214. return date_string
  215. def days_remaining_in_month():
  216. # 获取当前日期
  217. today = date.today()
  218. # 获取下个月的第一天
  219. if today.month == 12:
  220. next_month = today.replace(year=today.year + 1, month=1, day=1)
  221. else:
  222. next_month = today.replace(month=today.month + 1, day=1)
  223. # 计算本月最后一天(下个月第一天减去1天)
  224. last_day_of_month = next_month - timedelta(days=1)
  225. # 计算剩余天数
  226. remaining_days = (last_day_of_month - today).days
  227. return remaining_days
  228. def generate_task_trace_id():
  229. random_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=16))
  230. return f"Task-{datetime.now().strftime('%Y%m%d%H%M%S')}-{random_str}"
  231. def ci_lower(data: List[int], conf: float = 0.95) -> float:
  232. """
  233. 计算data的置信区间下限
  234. """
  235. if len(data) < 2:
  236. raise ValueError("Sample length less than 2")
  237. n = len(data)
  238. mean = statistics.mean(data)
  239. std = statistics.stdev(data) / math.sqrt(n)
  240. # t 分位点(左侧):ppf 返回负值
  241. t_left = t.ppf((1 - conf) / 2, df=n - 1)
  242. return mean + t_left * std
  243. def fetch_from_odps(query):
  244. client = ODPS(
  245. access_id="LTAIWYUujJAm7CbH",
  246. secret_access_key="RfSjdiWwED1sGFlsjXv0DlfTnZTG1P",
  247. endpoint="http://service.cn.maxcompute.aliyun.com/api",
  248. project="loghubods",
  249. )
  250. with client.execute_sql(query).open_reader() as reader:
  251. if reader:
  252. return [item for item in reader]
  253. else:
  254. return []
  255. def init_odps_client():
  256. return ODPS(
  257. access_id="LTAIWYUujJAm7CbH",
  258. secret_access_key="RfSjdiWwED1sGFlsjXv0DlfTnZTG1P",
  259. endpoint="http://service.cn.maxcompute.aliyun.com/api",
  260. project="loghubods",
  261. )
  262. def upload_to_oss(local_video_path, oss_key):
  263. """
  264. 把视频上传到 oss
  265. :return:
  266. """
  267. access_key_id = "LTAIP6x1l3DXfSxm"
  268. access_key_secret = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
  269. endpoint = "oss-cn-hangzhou.aliyuncs.com"
  270. bucket_name = "art-pubbucket"
  271. bucket = oss2.Bucket(
  272. oss2.Auth(access_key_id, access_key_secret), endpoint, bucket_name
  273. )
  274. bucket.put_object_from_file(key=oss_key, filename=local_video_path)
  275. return oss_key