# Server / piaoquan_crawler
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os, sys
import time
import random
sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
# from common import Common
# from scheduling_db import MysqlHelper


def get_user_from_mysql(log_type, crawler, source, env, action=''):
    """Fetch the crawl-target user rows for a source/mode pair.

    Queries ``crawler_user_v3`` for rows whose ``source`` equals *source* and
    whose ``mode`` equals *log_type*.

    :param log_type: log channel name; also used as the ``mode`` filter
    :param crawler: crawler name, forwarded to the DB helper and the logger
    :param source: value matched against the ``source`` column
    :param env: environment selector forwarded to ``MysqlHelper.get_values``
    :param action: forwarded unchanged to ``MysqlHelper.get_values``
    :return: the rows returned by the helper, or ``[]`` (after logging a
        warning) when the helper returns a falsy result
    """
    # NOTE(review): the SQL is assembled by string interpolation; values must
    # come from trusted configuration, not from end-user input.
    query = f"select * from crawler_user_v3 where source='{source}' and mode='{log_type}'"
    rows = MysqlHelper.get_values(log_type, crawler, query, env, action=action)
    if not rows:
        Common.logger(log_type, crawler).warning(f"爬虫:{crawler},没有查到抓取名单")
        return []
    return rows


def get_config_from_mysql(log_type, source, env, text, action=''):
    """Collect one of the comma-separated word lists configured for a source.

    Every row of ``crawler_config`` matching *source* carries a ``config``
    string that evaluates to a dict. The values stored under the keys
    ``title``, ``filter``, ``emoji`` and ``search_word`` are split on ``,``
    and accumulated across all rows.

    :param log_type: log channel name forwarded to the DB helper
    :param source: value matched against the ``source`` column (also passed
        to the helper as the crawler name)
    :param env: environment selector forwarded to ``MysqlHelper.get_values``
    :param text: which list to return: ``"title"``, ``"filter"``,
        ``"emoji"`` or ``"search_word"``
    :param action: forwarded unchanged to ``MysqlHelper.get_values``
    :return: the accumulated list for *text*; ``None`` when *text* is not one
        of the four recognised names
    """
    query = f"""select * from crawler_config where source="{source}" """
    rows = MysqlHelper.get_values(log_type, source, query, env, action=action)

    # One accumulator per recognised config key, in the original lookup order.
    collected = {"title": [], "filter": [], "emoji": [], "search_word": []}
    for row in rows:
        # NOTE(review): eval() executes whatever is stored in the config
        # column; this is only safe while that column is fully trusted.
        parsed = eval(row['config'])
        for key, raw in parsed.items():
            if key in collected:
                collected[key].extend(raw.split(","))

    for name, words in collected.items():
        if text == name:
            return words
    return None

def random_title(log_type, crawler, env, text):
    """Return one random entry from a configured word list.

    The list is loaded through ``get_config_from_mysql`` with *crawler* used
    as the config source.

    :param log_type: log channel name forwarded to the config lookup
    :param crawler: crawler name, used as the config ``source``
    :param env: environment selector forwarded to the config lookup
    :param text: which configured list to draw from (e.g. ``"title"``)
    :return: a randomly chosen element of that list
    """
    return random.choice(get_config_from_mysql(log_type, crawler, env, text))


def task_fun(task_str):
    """Parse a serialized task description into task and rule dictionaries.

    *task_str* is the textual form of a list of ``(key, value)`` pairs in
    which the ``rule`` value is a quoted list of single-entry dicts. The
    quotes around that list are stripped so the whole string evaluates in one
    pass, then the rule entries are merged into a single dict.

    :param task_str: serialized task, e.g. ``"[('task_id','11'),('rule','[{...}]')]"``
    :return: ``{"task_dict": <task with merged rule>, "rule_dict": <merged rule>}``;
        ``task_dict["rule"]`` and ``rule_dict`` are the same object
    """
    # Unquote the embedded rule list: '[ ... ]' -> [ ... ]
    normalized = task_str.replace("'[", '[').replace("]'", ']')
    # NOTE(review): eval() runs arbitrary expressions; task_str must come
    # from a trusted source.
    parsed = dict(eval(normalized))

    # Flatten [{k1: v1}, {k2: v2}, ...] into {k1: v1, k2: v2, ...};
    # later entries override earlier ones with the same key.
    merged_rule = {
        key: bounds
        for entry in parsed['rule']
        for key, bounds in entry.items()
    }
    parsed['rule'] = merged_rule
    return {"task_dict": parsed, "rule_dict": merged_rule}


def download_rule(log_type, crawler, video_dict, rule_dict):
    """
    Basic rule check that decides whether a video should be downloaded.

    Both dictionaries are normalised in place before the comparison:

    * ``video_dict["publish_time"]`` is set to ``publish_time_stamp * 1000``
      when ``publish_time_stamp`` is present;
    * ``video_dict["period"]`` (whole days since ``publish_time``) is computed
      when it is missing and ``publish_time`` is available;
    * a rule whose ``max`` equals 0 has ``max`` replaced by 999999999999999,
      i.e. 0 is treated as "no upper bound";
    * every rule key missing from ``video_dict`` is filled with half of the
      rule's ``max``.

    :param log_type: log channel name
    :param crawler: crawler name
    :param video_dict: video information, dict of field -> value
    :param rule_dict: rule information, dict of field -> ``{"min": .., "max": ..}``
    :return: True when every field that has a rule lies within its bounds,
        False as soon as one field falls outside
    """
    # Normalise video_dict: publish_time derived from publish_time_stamp.
    if "publish_time_stamp" in video_dict:
        video_dict["publish_time"] = video_dict["publish_time_stamp"] * 1000
    # Normalise video_dict: period = whole days elapsed since publish_time.
    if "period" not in video_dict and "publish_time" in video_dict:
        now_ms = int(time.time() * 1000)
        video_dict["period"] = int((now_ms - video_dict["publish_time"]) / (3600 * 24 * 1000))
    # Normalise rule_dict: a max of 0 means "unbounded".
    for bounds in rule_dict.values():
        if bounds["max"] == 0:
            bounds["max"] = 999999999999999
    # Fill in fields that have a rule but no value on the video.
    for rule_key, bounds in rule_dict.items():
        if rule_key not in video_dict:
            video_dict[rule_key] = int(bounds["max"] / 2)
    # Compare each video field against its rule, in video_dict order.
    for video_key, video_value in video_dict.items():
        if video_key not in rule_dict:
            continue
        bounds = rule_dict[video_key]
        if video_key == "period":
            # NOTE(review): the period lower bound is fixed at 0 and the
            # rule's "min" is ignored -- confirm this is intended.
            result = 0 <= int(video_value) <= int(bounds["max"])
            # Log the bound that is actually compared (max, not min).
            Common.logger(log_type, crawler).info(f'{video_key}: 0 <= {video_value} <= {bounds["max"]}, {result}')
        else:
            result = int(bounds["min"]) <= int(video_value) <= int(bounds["max"])
            Common.logger(log_type, crawler).info(f'{video_key}: {bounds["min"]} <= {video_value} <= {bounds["max"]}，{result}')
        if result is False:
            return False
    return True


if __name__ == "__main__":
    # Manual scratch area: the commented-out calls below are ad-hoc checks
    # kept for reference. Running this module as a script does nothing.
    # NOTE(review): `filter_word` and `task` are not defined in the visible
    # part of this module (the parser here is named `task_fun`), so these
    # snippets look stale -- confirm before re-enabling them.
    # print(filter_word('public', 'xiaoniangao', '小年糕', 'prod'))
    # print(get_config_from_mysql('test', 'gongzhonghao', 'prod', 'filter'))
    # print(filter_word('test', 'gongzhonghao', '公众号', 'prod'))
    # Sample serialized task in the format the task parser expects:
    # task_str = "[('task_id','11')," \
    #            "('task_name','小年糕小时榜')," \
    #            "('source','xiaoniangao')," \
    #            "('start_time','1681834560000')," \
    #            "('interval','1'),('mode','hour')," \
    #            "('rule','[{'duration':{'min':40,'max':0}},{'playCnt':{'min':4000,'max':0}},{'period':{'min':10,'max':0}},{'fans':{'min':0,'max':0}},{'videos':{'min':0,'max':0}},{'like':{'min':0,'max':0}},{'videoWidth':{'min':0,'max':0}},{'videoHeight':{'min':0,'max':0}}]')," \
    #            "('spider_name','')," \
    #            "('machine','')," \
    #            "('status','0')," \
    #            "('create_time','1681889875288')," \
    #            "('update_time','1681889904908')," \
    #            "('operator','王坤')]"
    # print(task(task_str))
    pass