123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- """
- @author: luojunhui
- """
- import random
- import string
- import hashlib
- import math
- import statistics
- from scipy.stats import t
- from datetime import datetime, timezone, date, timedelta
- from typing import List
- from requests import RequestException
- from urllib.parse import urlparse, parse_qs
- from tenacity import (
- stop_after_attempt,
- wait_exponential,
- retry_if_exception_type,
- )
- from applications.config import name_map
def str_to_md5(strings):
    """
    Return the hexadecimal MD5 digest of a string.

    :param strings: text to hash; it is encoded as UTF-8 before hashing
    :return: the digest as a hex string
    """
    # Encode, hash and render as hex in a single pass.
    return hashlib.md5(strings.encode("utf-8")).hexdigest()
def proxy():
    """
    Build the proxy mapping for the Kuaidaili tunnel proxy.

    :return: dict with ``http`` and ``https`` keys, both pointing at the same
        ``http://user:password@host:port/`` tunnel URL
    """
    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration.
    fields = {
        "user": "t14070979713487",  # username/password authentication
        "pwd": "hqwanfvy",
        "proxy": "j685.kdltps.com:15818",  # tunnel host:port
    }
    # Both schemes go through the same HTTP tunnel, so the URL is built once.
    tunnel_url = "http://%(user)s:%(pwd)s@%(proxy)s/" % fields
    return {"http": tunnel_url, "https": tunnel_url}
def async_proxy():
    """
    Return the tunnel proxy settings as separate fields.

    :return: dict with ``url``, ``username`` and ``password`` entries
    """
    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration.
    return dict(
        url="http://j685.kdltps.com:15818",
        username="t14070979713487",
        password="hqwanfvy",
    )
def request_retry(retry_times, min_retry_delay, max_retry_delay):
    """
    Build the keyword arguments describing a retry policy.

    :param retry_times: value passed to ``stop_after_attempt``
    :param min_retry_delay: ``min`` argument of ``wait_exponential``
    :param max_retry_delay: ``max`` argument of ``wait_exponential``
    :return: dict with ``stop``, ``wait``, ``retry`` and ``reraise`` entries
    """
    retryable_errors = (RequestException, TimeoutError)
    return {
        "stop": stop_after_attempt(retry_times),
        "wait": wait_exponential(min=min_retry_delay, max=max_retry_delay),
        "retry": retry_if_exception_type(retryable_errors),
        # Re-raise the exception once the retries are exhausted.
        "reraise": True,
    }
def yield_batch(data, batch_size):
    """
    Yield consecutive slices of ``data`` holding at most ``batch_size`` items.

    :param data: sliceable sequence supporting ``len()``
    :param batch_size: step between slice starts and maximum slice length
    :return: generator of slices; the last one may be shorter
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[start : start + batch_size] for start in offsets)
def extract_root_source_id(path: str) -> dict:
    """
    Extract the video id and root source id carried by a path.

    The path's query string is parsed for a ``jumpPage`` parameter, whose
    value is itself parsed as a query string.

    :param path: path or URL whose query may contain ``jumpPage``
    :return: dict with ``video_id`` and ``root_source_id``, or an empty dict
        when ``jumpPage`` is absent or empty
    :raises KeyError: if ``jumpPage`` is present but lacks the expected keys
    """
    outer_query = parse_qs(urlparse(path).query)
    jump_page = outer_query.get("jumpPage", [None])[0]
    if not jump_page:
        return {}

    # The nested value has no "?"-separated query of its own, so the first
    # key keeps the page prefix ("pages/user-videos?id").
    inner_query = parse_qs(jump_page)
    return {
        "video_id": inner_query["pages/user-videos?id"][0],
        "root_source_id": inner_query["rootSourceId"][0],
    }
def show_desc_to_sta(show_desc):
    """
    Parse a display-statistics string into integer counters.

    Every ``+`` is removed, the text is split into items on the
    ``U+2004 U+2005`` separator, and each item is split into a label and a
    value on ``U+2006``. Values may use the Chinese unit suffixes for
    thousand, ten-thousand and hundred-million.

    :param show_desc: raw statistics text
    :return: dict with ``show_view_count``, ``show_like_count``,
        ``show_pay_count`` and ``show_zs_count``; missing counters are 0
    :raises ValueError: if an item does not split into exactly one label and
        one value, or a value is not numeric
    """

    def decode_show_v(show_v):
        """
        Convert a value such as ``1.2万`` into an integer.

        :param show_v: numeric text, optionally with a Chinese unit suffix
        :return: the value truncated to ``int``
        """
        # Rewrite the unit suffix as a decimal exponent, e.g. "1.2万" -> "1.2e4".
        normalized = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
        # SECURITY: this used to be eval(), which would execute any expression
        # embedded in the input text. float() parses the same numeric
        # notation and rejects everything else.
        return int(float(normalized))

    def decode_show_k(show_k):
        """
        Map a display label to its counter name.

        :param show_k: label text
        :return: counter name, or ``show_unknown`` for an unrecognised label
        """
        this_dict = {
            "阅读": "show_view_count",  # article
            "看过": "show_view_count",  # image-text post
            "观看": "show_view_count",  # video
            "赞": "show_like_count",
            "付费": "show_pay_count",
            "赞赏": "show_zs_count",
        }
        if show_k not in this_dict:
            print(f"error from decode_show_k, show_k not found: {show_k}")
        return this_dict.get(show_k, "show_unknown")

    show_desc = show_desc.replace("+", "")
    sta = {}
    for show_kv in show_desc.split("\u2004\u2005"):
        if not show_kv:
            continue
        show_k, show_v = show_kv.split("\u2006")
        sta[decode_show_k(show_k)] = decode_show_v(show_v)

    # Only the four known counters are reported; unknown labels are dropped.
    return {
        "show_view_count": sta.get("show_view_count", 0),
        "show_like_count": sta.get("show_like_count", 0),
        "show_pay_count": sta.get("show_pay_count", 0),
        "show_zs_count": sta.get("show_zs_count", 0),
    }
def generate_gzh_id(url):
    """
    Derive an id from the ``biz``, ``idx`` and ``sn`` values found in a URL.

    The three values are joined as ``biz-idx-sn`` and hashed with MD5.

    :param url: URL text containing ``biz=``, ``&idx=`` and ``&sn=``
    :return: hex MD5 digest of the joined values
    :raises IndexError: if any of the three markers is missing
    """

    def value_after(marker):
        # Text between the marker and the next "&" (or the end of the URL).
        return url.split(marker)[1].split("&")[0]

    identity = "{}-{}-{}".format(
        value_after("biz="), value_after("&idx="), value_after("&sn=")
    )
    return hashlib.md5(identity.encode()).hexdigest()
def timestamp_to_str(timestamp, string_format="%Y-%m-%d %H:%M:%S") -> str:
    """
    Format a POSIX timestamp as text in the system's local time zone.

    :param timestamp: seconds since the epoch
    :param string_format: ``strftime`` format applied to the local datetime
    :return: the formatted date string
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; building
    # the aware UTC datetime directly yields the same instant.
    utc_moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    # astimezone() with no argument converts to the system local time zone.
    return utc_moment.astimezone().strftime(string_format)
def days_remaining_in_month():
    """
    Return how many days are left in the current month after today.

    :return: 0 on the last day of the month, otherwise the number of days
        between today and the month's last day
    """
    today = date.today()
    # Work out the first day of the following month, rolling the year over
    # in December.
    if today.month == 12:
        year, month = today.year + 1, 1
    else:
        year, month = today.year, today.month + 1
    first_of_next_month = date(year, month, 1)
    # The last day of this month is the day before the next month starts.
    month_end = first_of_next_month - timedelta(days=1)
    return (month_end - today).days
def generate_task_trace_id():
    """
    Create a trace id of the form ``Task-<YYYYmmddHHMMSS>-<16 random chars>``.

    :return: the trace id; the random part uses lowercase letters and digits
    """
    alphabet = string.ascii_lowercase + string.digits
    suffix = "".join(random.choices(alphabet, k=16))
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return f"Task-{stamp}-{suffix}"
def ci_lower(data: List[int], conf: float = 0.95) -> float:
    """
    Compute the lower bound of the two-sided confidence interval of the mean.

    :param data: sample values; at least two are required
    :param conf: confidence level, 0.95 by default
    :return: ``mean + t * standard_error`` using the left-tail t quantile
    :raises ValueError: if ``data`` has fewer than two values
    """
    sample_size = len(data)
    if sample_size < 2:
        raise ValueError("Sample length less than 2")

    center = statistics.mean(data)
    standard_error = statistics.stdev(data) / math.sqrt(sample_size)
    # Left-tail quantile of the t distribution: ppf returns a negative value,
    # so adding it moves below the mean.
    left_quantile = t.ppf((1 - conf) / 2, df=sample_size - 1)
    return center + left_quantile * standard_error
def get_task_chinese_name(data):
    """
    Build a display name for a task from its detail record.

    The task name is looked up in ``name_map`` (falling back to the raw name).
    For two task types, further fields are appended, separated by tabs.

    :param data: mapping that must contain ``task_name``; other keys are
        optional and default to empty values
    :return: the display name
    :raises KeyError: if ``task_name`` is missing
    """
    task_name = data["task_name"]
    task_name_chinese = name_map.get(task_name, task_name)

    if task_name == "crawler_gzh_articles":
        account_method = (
            data.get("account_method", "")
            .replace("account_association", "账号联想")
            .replace("search", "")
        )
        crawl_mode = (
            data.get("crawl_mode", "")
            .replace("search", "搜索")
            .replace("account", "抓账号")
        )
        strategy = data.get("strategy", "")
        return f"{task_name_chinese}\t{crawl_mode}\t{account_method}\t{strategy}"

    if task_name == "article_pool_cold_start":
        # Default to "" like the other fields; without a default a missing
        # "platform" key raised AttributeError on None.replace.
        platform = (
            data.get("platform", "")
            .replace("toutiao", "今日头条")
            .replace("weixin", "微信")
        )
        strategy = data.get("strategy", "").replace("strategy", "策略")
        category_list = "、".join(data.get("category_list", []))
        crawler_methods = "、".join(data.get("crawler_methods", []))
        return f"{task_name_chinese}\t{platform}\t{crawler_methods}\t{category_list}\t{strategy}"

    return task_name_chinese
|