123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- """
- @author: luojunhui
- """
- import hashlib
- from requests import RequestException
- from tenacity import (
- stop_after_attempt,
- wait_exponential,
- retry_if_exception_type,
- )
def str_to_md5(strings):
    """
    Compute the MD5 digest of a string.

    :param strings: text to hash; it is encoded as UTF-8 before hashing
    :return: the digest rendered as a hexadecimal string
    """
    # Encode once, hash in a single call, and render the digest as hex.
    return hashlib.md5(strings.encode("utf-8")).hexdigest()
def proxy(
    tunnel="j685.kdltps.com:15818",
    username="t14070979713487",
    password="hqwanfvy",
):
    """
    Build the proxies mapping for the Kuaidaili (快代理) tunnel proxy.

    Called without arguments it returns exactly the mapping the module has
    always produced; the parameters let a caller target another tunnel or
    account without editing this file.

    :param tunnel: tunnel endpoint in ``host:port`` form
    :param username: proxy account user name
    :param password: proxy account password
    :return: dict with ``"http"`` and ``"https"`` keys, both set to the same
        ``http://user:password@host:port/`` proxy URL (presumably passed as
        the ``proxies=`` argument of a requests call -- confirm with callers)
    """
    # NOTE(review): the default credentials are committed in source. They are
    # kept only for backward compatibility; prefer passing them in from
    # configuration or a secret store.
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot change how the URL is split into user, password and host.
    user = quote(username, safe="")
    pwd = quote(password, safe="")

    # Both schemes are routed through the same HTTP tunnel endpoint.
    proxy_url = f"http://{user}:{pwd}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}
def request_retry(retry_times, min_retry_delay, max_retry_delay):
    """
    Assemble the keyword arguments describing a tenacity retry policy.

    The returned dict is presumably unpacked into ``tenacity.retry(**...)``
    by callers -- confirm at the call sites.

    :param retry_times: value passed to ``stop_after_attempt``
    :param min_retry_delay: ``min`` bound passed to ``wait_exponential``
    :param max_retry_delay: ``max`` bound passed to ``wait_exponential``
    :return: dict with the keys ``stop``, ``wait``, ``retry`` and ``reraise``
    """
    # Only these exception types trigger another attempt.
    retryable_errors = (RequestException, TimeoutError)

    return {
        "stop": stop_after_attempt(retry_times),
        "wait": wait_exponential(min=min_retry_delay, max=max_retry_delay),
        "retry": retry_if_exception_type(retryable_errors),
        # Re-raise the original exception once the retries are exhausted.
        "reraise": True,
    }
def yield_batch(data, batch_size):
    """
    Lazily split a sequence into consecutive slices.

    :param data: object supporting ``len()`` and slicing (list, str, tuple, ...)
    :param batch_size: step between slice starts and maximum slice length
    :return: generator yielding ``data[start:start + batch_size]`` for each
        start offset; the last slice may be shorter than ``batch_size``
    """
    # Start offsets are 0, batch_size, 2 * batch_size, ... up to len(data).
    yield from (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )
def show_desc_to_sta(show_desc):
    """
    Parse a "show" description string into engagement counters.

    The description is a run of ``<label><U+2006><value>`` pairs separated by
    the two-character sequence U+2004 U+2005. Every ``+`` is stripped first
    (e.g. ``10万+``), and values may carry a Chinese magnitude suffix:
    千 (1e3), 万 (1e4) or 亿 (1e8).

    :param show_desc: description text; ``None`` or an empty string yields
        all-zero counters
    :return: dict with the integer fields ``show_view_count``,
        ``show_like_count``, ``show_pay_count`` and ``show_zs_count``;
        fields absent from the description are 0
    :raises ValueError: if a pair does not contain exactly one U+2006
        separator, or if a value is not a plain number
    """
    # Magnitude suffixes rewritten to exponent notation, e.g. '1.2万' -> '1.2e4'.
    # Keeping e-notation (rather than multiplying) makes float() produce the
    # same value the former eval() of that literal did.
    unit_exponents = (('千', 'e3'), ('万', 'e4'), ('亿', 'e8'))

    # Label text -> output field name.
    label_to_field = {
        '阅读': 'show_view_count',  # articles
        '看过': 'show_view_count',  # image-text posts
        '观看': 'show_view_count',  # videos
        '赞': 'show_like_count',
        '付费': 'show_pay_count',
        '赞赏': 'show_zs_count',
    }
    result_fields = (
        'show_view_count',
        'show_like_count',
        'show_pay_count',
        'show_zs_count',
    )

    def decode_show_v(show_v):
        """
        Convert a displayed count such as ``'1.2万'`` to an int.

        :param show_v: value text of one pair
        :return: the count, truncated to an integer
        """
        for unit, exponent in unit_exponents:
            show_v = show_v.replace(unit, exponent)
        # SECURITY: this text comes from external content, so it is parsed as
        # a number instead of being passed to eval(), which would execute
        # arbitrary expressions embedded in the description.
        try:
            # Plain integers are parsed exactly, without a float round-trip.
            return int(show_v)
        except ValueError:
            pass
        return int(float(show_v))

    def decode_show_k(show_k):
        """
        Map a label to its output field name.

        :param show_k: label text of one pair
        :return: the field name, or ``'show_unknown'`` for unrecognised labels
        """
        if show_k not in label_to_field:
            print(f'error from decode_show_k, show_k not found: {show_k}')
        return label_to_field.get(show_k, 'show_unknown')

    # Treat missing input like an empty description instead of failing on None.
    if not show_desc:
        return {field: 0 for field in result_fields}

    show_desc = show_desc.replace('+', '')
    sta = {}
    for show_kv in show_desc.split('\u2004\u2005'):
        if not show_kv:
            continue
        show_k, show_v = show_kv.split('\u2006')
        sta[decode_show_k(show_k)] = decode_show_v(show_v)

    # Only the known fields are reported; 'show_unknown' entries are dropped.
    return {field: sta.get(field, 0) for field in result_fields}
|