common.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. """
  2. @author: luojunhui
  3. """
  4. import hashlib
  5. from requests import RequestException
  6. from tenacity import (
  7. stop_after_attempt,
  8. wait_exponential,
  9. retry_if_exception_type,
  10. )
  11. def str_to_md5(strings):
  12. """
  13. 字符串转化为 md5 值
  14. :param strings:
  15. :return:
  16. """
  17. # 将字符串转换为字节
  18. original_bytes = strings.encode("utf-8")
  19. # 创建一个md5 hash对象
  20. md5_hash = hashlib.md5()
  21. # 更新hash对象,传入原始字节
  22. md5_hash.update(original_bytes)
  23. # 获取16进制形式的MD5哈希值
  24. md5_value = md5_hash.hexdigest()
  25. return md5_value
  26. def proxy():
  27. """
  28. 快代理
  29. """
  30. # 隧道域名:端口号
  31. tunnel = "j685.kdltps.com:15818"
  32. # 用户名密码方式
  33. username = "t14070979713487"
  34. password = "hqwanfvy"
  35. proxies = {
  36. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  37. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  38. }
  39. return proxies
  40. def request_retry(retry_times, min_retry_delay, max_retry_delay):
  41. """
  42. :param retry_times:
  43. :param min_retry_delay:
  44. :param max_retry_delay:
  45. """
  46. common_retry = dict(
  47. stop=stop_after_attempt(retry_times),
  48. wait=wait_exponential(min=min_retry_delay, max=max_retry_delay),
  49. retry=retry_if_exception_type((RequestException, TimeoutError)),
  50. reraise=True # 重试耗尽后重新抛出异常
  51. )
  52. return common_retry
  53. def yield_batch(data, batch_size):
  54. """
  55. 生成批次数据
  56. :param data:
  57. :param batch_size:
  58. :return:
  59. """
  60. for i in range(0, len(data), batch_size):
  61. yield data[i:i + batch_size]
  62. def show_desc_to_sta(show_desc):
  63. """
  64. :return:
  65. """
  66. def decode_show_v(show_v):
  67. """
  68. :param show_v:
  69. :return:
  70. """
  71. foo = show_v.replace('千', 'e3').replace('万', 'e4').replace('亿', 'e8')
  72. foo = eval(foo)
  73. return int(foo)
  74. def decode_show_k(show_k):
  75. """
  76. :param show_k:
  77. :return:
  78. """
  79. this_dict = {
  80. '阅读': 'show_view_count', # 文章
  81. '看过': 'show_view_count', # 图文
  82. '观看': 'show_view_count', # 视频
  83. '赞': 'show_like_count',
  84. '付费': 'show_pay_count',
  85. '赞赏': 'show_zs_count',
  86. }
  87. if show_k not in this_dict:
  88. print(f'error from decode_show_k, show_k not found: {show_k}')
  89. return this_dict.get(show_k, 'show_unknown')
  90. show_desc = show_desc.replace('+', '')
  91. sta = {}
  92. for show_kv in show_desc.split('\u2004\u2005'):
  93. if not show_kv:
  94. continue
  95. show_k, show_v = show_kv.split('\u2006')
  96. k = decode_show_k(show_k)
  97. v = decode_show_v(show_v)
  98. sta[k] = v
  99. res = {
  100. 'show_view_count': sta.get('show_view_count', 0),
  101. 'show_like_count': sta.get('show_like_count', 0),
  102. 'show_pay_count': sta.get('show_pay_count', 0),
  103. 'show_zs_count': sta.get('show_zs_count', 0),
  104. }
  105. return res