common.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. """
  2. @author: luojunhui
  3. """
  4. import hashlib
  5. from requests import RequestException
  6. from urllib.parse import urlparse, parse_qs
  7. from tenacity import (
  8. stop_after_attempt,
  9. wait_exponential,
  10. retry_if_exception_type,
  11. )
  12. def str_to_md5(strings):
  13. """
  14. 字符串转化为 md5 值
  15. :param strings:
  16. :return:
  17. """
  18. # 将字符串转换为字节
  19. original_bytes = strings.encode("utf-8")
  20. # 创建一个md5 hash对象
  21. md5_hash = hashlib.md5()
  22. # 更新hash对象,传入原始字节
  23. md5_hash.update(original_bytes)
  24. # 获取16进制形式的MD5哈希值
  25. md5_value = md5_hash.hexdigest()
  26. return md5_value
  27. def proxy():
  28. """
  29. 快代理
  30. """
  31. # 隧道域名:端口号
  32. tunnel = "j685.kdltps.com:15818"
  33. # 用户名密码方式
  34. username = "t14070979713487"
  35. password = "hqwanfvy"
  36. proxies = {
  37. "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
  38. % {"user": username, "pwd": password, "proxy": tunnel},
  39. "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
  40. % {"user": username, "pwd": password, "proxy": tunnel},
  41. }
  42. return proxies
  43. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  44. """
  45. :param retry_times:
  46. :param min_retry_delay:
  47. :param max_retry_delay:
  48. """
  49. common_retry = dict(
  50. stop=stop_after_attempt(retry_times),
  51. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  52. retry=retry_if_exception_type((RequestException, TimeoutError)),
  53. reraise=True, # 重试耗尽后重新抛出异常
  54. )
  55. return common_retry
  56. def yield_batch(data, batch_size):
  57. """
  58. 生成批次数据
  59. :param data:
  60. :param batch_size:
  61. :return:
  62. """
  63. for i in range(0, len(data), batch_size):
  64. yield data[i : i + batch_size]
  65. def extract_root_source_id(path: str) -> dict:
  66. """
  67. 提取path参数
  68. :param path:
  69. :return:
  70. """
  71. params = parse_qs(urlparse(path).query)
  72. jump_page = params.get("jumpPage", [None])[0]
  73. if jump_page:
  74. params2 = parse_qs(jump_page)
  75. res = {
  76. "video_id": params2["pages/user-videos?id"][0],
  77. "root_source_id": params2["rootSourceId"][0],
  78. }
  79. return res
  80. else:
  81. return {}