import argparse
import json
import os
import sys

sys.path.append(os.getcwd())

from application.common.mysql import MysqlHelper
from spider.spider_map import spider_map
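# spider_map is expected to map platform -> mode -> spider class, e.g.
# (hypothetical) spider_map = {"xigua": {"recommend": XiguaRecommendSpider}}.
# start_crawl() below looks up the concrete spider class in this mapping.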

class OnlineManager(object):
    """
    Online crawler template.
    Todo: add Aliyun (Alibaba Cloud) logging.
    """

    def __init__(self, task_id, mode, platform):
        self.env = "prod"
        self.task_id = task_id
        self.mode = mode
        self.platform = platform
        self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
    def get_task_rule(self):
        """
        Fetch this task's rule configuration.
        :return: rule_dict, the task rules flattened into a single dict
        """
        rule_dict = {}
        # task_id is interpolated directly into the SQL, so it must come from a trusted caller.
        task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
        data = self.MySQL.select(task_rule_sql)
        if data:
            # The rule column stores a JSON list of single-key dicts;
            # flatten it into one dict for the spider.
            rule_list = json.loads(data[0][0])
            for item in rule_list:
                for key in item:
                    rule_dict[key] = item[key]
        return rule_dict
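    # Example of the flattening above (hypothetical rule payload): a rule column of
    # '[{"period": {"min": 5, "max": 5}}, {"duration": {"min": 50, "max": 0}}]'
    # becomes {"period": {"min": 5, "max": 5}, "duration": {"min": 50, "max": 0}}.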

    def get_task_user_list(self):
        """
        Fetch the users bound to this task.
        :return: user_list, a list of {"uid": ..., "link": ...} dicts
        """
        task_user_list_sql = f"SELECT uid, link FROM crawler_user_v3 WHERE task_id = {self.task_id};"
        uid_list = self.MySQL.select(task_user_list_sql)
        user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
        return user_list
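    # Example return value (illustrative): [{"uid": 58027209, "link": "https://..."}],
    # one dict per crawler_user_v3 row bound to this task.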

    def start_crawl(self):
        """
        Resolve the spider class for this platform/mode and run it
        with the task's rules and user list.
        """
        rule_dict = self.get_task_rule()
        user_list = self.get_task_user_list()
        if rule_dict and user_list:
            spider_class = spider_map[self.platform][self.mode]
            main_process = spider_class(
                platform=self.platform,
                mode=self.mode,
                rule_dict=rule_dict,
                user_list=user_list,
                env=self.env,
            )
            main_process.run()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # build the command-line argument parser
    parser.add_argument("--task_id")
    parser.add_argument("--mode")
    parser.add_argument("--platform")
    args = parser.parse_args()
    M = OnlineManager(task_id=args.task_id, mode=args.mode, platform=args.platform)
    M.start_crawl()
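
# Example invocation (file name and argument values are illustrative):
#   python online_manager.py --task_id 21 --mode recommend --platform xigua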