# Server / piaoquan_crawler
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/2
import os
import sys
import time

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper, RedisHelper


class Scheduling:
    """Move due rows of the ``crawler_task`` table through Redis and launch them.

    ``write_redis`` is the producer side: it reads the due tasks from MySQL,
    pushes each one (as ``str(dict)``) onto a Redis list chosen by the task's
    ``machine`` field and advances the task's ``next_time``.
    ``scheduling_task`` is the consumer side: it pops one task from the Redis
    list chosen by ``env`` and starts the matching spider through
    ``scheduling/scheduling_main/scheduling.sh``.
    """

    # Redis list used for every machine/env value not listed below.
    _DEFAULT_QUEUE = 'crawler_config_task_queue:dev'
    # Producer side: value of a task's ``machine`` field -> Redis list.
    _MACHINE_QUEUES = {
        'hk': 'crawler_config_task_queue:hk',
        'aliyun': 'crawler_config_task_queue:aliyun',
    }
    # Consumer side: value of ``env`` -> Redis list.
    _ENV_QUEUES = {
        'hk': 'crawler_config_task_queue:hk',
        'prod': 'crawler_config_task_queue:aliyun',
    }

    # Read the task table
    @classmethod
    def get_task(cls, log_type, crawler, env):
        """Return the rows of ``crawler_task`` whose ``next_time`` has been reached.

        Args:
            log_type: forwarded to ``MysqlHelper.get_values``.
            crawler: forwarded to ``MysqlHelper.get_values``.
            env: forwarded to ``MysqlHelper.get_values``.

        Returns:
            A list of the rows for which ``int(time.time()) >= row["next_time"]``;
            an empty list when the query yields nothing.
        """
        get_sql = """ select * from crawler_task """
        all_task_list = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=get_sql, env=env)
        # Defensive: treat a falsy result (empty, or None should the helper
        # return that on failure -- TODO confirm) as "no tasks" instead of
        # failing while iterating it.
        if not all_task_list:
            return []
        # One timestamp for the whole pass so every row is judged against the
        # same instant.
        now = int(time.time())
        return [task for task in all_task_list if now >= task["next_time"]]

    # Update the next start time; called when the task with this task_id is scheduled
    @classmethod
    def update_task(cls, log_type, crawler, task_id, next_time, interval_piaoquan, env):
        """Advance a task's ``next_time`` by ``interval_piaoquan``.

        Nothing is written when ``interval_piaoquan`` is not greater than zero.

        Args:
            log_type: forwarded to ``MysqlHelper.update_values``.
            crawler: forwarded to ``MysqlHelper.update_values``.
            task_id: value matched against the ``task_id`` column.
            next_time: the task's current ``next_time``.
            interval_piaoquan: amount added to ``next_time``.
            env: forwarded to ``MysqlHelper.update_values``.
        """
        if not interval_piaoquan > 0:
            return
        new_next_time = next_time + interval_piaoquan
        # NOTE(review): the statement is built by string interpolation; both
        # values are expected to be numeric -- verify against the table schema.
        update_sql = f""" UPDATE crawler_task SET next_time={new_next_time} WHERE task_id={task_id} """
        MysqlHelper.update_values(log_type, crawler, update_sql, env)

    # Resource allocation / assembly
    @classmethod
    def write_redis(cls, log_type, crawler, env):
        """Push every due task onto its Redis list and advance its ``next_time``.

        Logs a message and does nothing else when there is no due task.
        """
        pre_task_list = cls.get_task(log_type=log_type, crawler=crawler, env=env)
        if not pre_task_list:
            Common.logger(log_type, crawler).info("暂无新任务\n")
            return

        for pre_task in pre_task_list:
            # The machine field only distinguishes overseas crawlers from
            # domestic ones; it carries no other meaning.
            machine = pre_task.get('machine', 'dev')
            next_time = pre_task['next_time']
            interval_piaoquan = pre_task['interval_piaoquan']
            task_id = pre_task['task_id']

            # Write to redis: the task travels as the repr of its dict.
            task_key = cls._MACHINE_QUEUES.get(machine, cls._DEFAULT_QUEUE)
            RedisHelper.redis_push(env, task_key, str(pre_task))

            # get_task already selected rows on this same condition; it is
            # re-checked here right before the update.
            if int(time.time()) >= next_time:
                cls.update_task(log_type, crawler, task_id, next_time, interval_piaoquan, env)

    @classmethod
    def get_redis(cls, log_type, crawler, env):
        """Pop one task from the Redis list selected by ``env``.

        ``env`` "hk" reads the hk list, "prod" reads the aliyun list and any
        other value reads the dev list.

        Returns:
            The evaluated task, or ``None`` when the pop returns ``None`` or an
            empty value.
        """
        task_key = cls._ENV_QUEUES.get(env, cls._DEFAULT_QUEUE)
        redis_data = RedisHelper.redis_pop(env, task_key)
        if redis_data is None or len(redis_data) == 0:
            return None
        # NOTE(security): eval() executes whatever text is stored in the Redis
        # list. write_redis stores str(dict), so ast.literal_eval would be a
        # safer parser if every stored value is a plain literal -- TODO confirm
        # before switching.
        return eval(str(redis_data, encoding="utf8"))

    @classmethod
    def scheduling_task(cls, log_type, crawler, env):
        """Pop one task from Redis and launch its spider via ``scheduling.sh``.

        Logs a message and returns when no task is available. Otherwise the
        task is flattened into a list of ``(name, value)`` pairs, rendered with
        ``str()``, stripped of spaces and passed on the command line as
        ``--task``.
        """
        task = cls.get_redis(log_type, crawler, env)
        if not task:
            Common.logger(log_type, crawler).info("Redis为空，程序退出")
            return
        Common.logger(log_type, crawler).info(f"task: {task}")
        Common.logger(log_type, crawler).info(f"已获取调度任务:{task}")
        task_id = task['task_id']
        source = task['source']
        spider_name = task['spider_name']

        # NOTE(review): get_redis selects the aliyun list for env == "prod",
        # while "inner" is chosen here only for env == "aliyun"; with
        # env == "prod" the endpoint is "out". Confirm this is intended.
        if env == "aliyun":
            oss_endpoint = "inner"
        elif env == "hk":
            oss_endpoint = "hk"
        else:
            oss_endpoint = "out"

        # Production environment: schedule the task
        Common.logger(log_type, crawler).info(f"开始调度任务:{task}\n")

        # spider_rule is stored as text; evaluate it once and reuse the result
        # for every rule field.
        # NOTE(security): eval() runs the stored text as Python code.
        spider_rule = eval(task['spider_rule'])
        duration = spider_rule['duration']
        task_str = [
            ('task_id', str(task_id)),
            ('task_name', str(task['task_name'])),
            ('source', str(task['source'])),
            ('next_time', str(task['next_time'])),
            ('interval_piaoquan', str(task['interval_piaoquan'])),
            ('play_cnt', spider_rule['play_cnt']),
            ('video_width', spider_rule['video_width']),
            ('video_height', spider_rule['video_height']),
            ('video_like', spider_rule['video_like']),
            ('share_cnt', spider_rule['share_cnt']),
            ('duration_min', duration['min']),
            ('duration_max', duration['max']),
            ('task_type', task['task_type']),
            ('spider_link', eval(task['spider_link'])),
            ('spider_name', str(task['spider_name'])),
            ('min_publish_time', str(task['min_publish_time'])),
            ('min_publish_day', str(task['min_publish_day'])),
            ('media_id', str(task['media_id'])),
            ('applets_status', str(task['applets_status'])),
            ('app_status', str(task['app_status'])),
            ('user_tag', str(task['user_tag'])),
            ('user_content_tag', str(task['user_content_tag'])),
            ('machine', str(task['machine'])),
        ]
        # Every space is removed from the rendered list, including any that
        # occur inside the values themselves.
        task_str = str(task_str).replace(' ', '')
        # NOTE(security): the command line is assembled from task fields without
        # escaping and executed through the shell by os.system.
        cmd = f"""sh scheduling/scheduling_main/scheduling.sh {source}/{source}_main/{spider_name}.py --log_type="{spider_name}" --crawler="{source}" --task="{task_str}" --oss_endpoint="{oss_endpoint}" --env="{env}" {source}/{source}-nohup.log """
        Common.logger(log_type, crawler).info(f"cmd:{cmd}\n")
        os.system(cmd)


if __name__ == "__main__":
    # Manual run: pop a single task for the "dev" environment and dispatch it.
    # Scheduling.get_task, Scheduling.get_redis and Scheduling.write_redis take
    # the same three arguments and can be called here for debugging.
    Scheduling.scheduling_task(log_type="scheduling", crawler="scheduling", env="dev")