@@ -5,14 +5,13 @@ import argparse
 
 sys.path.append(os.getcwd())
 
-from application.common.mysql import MysqlHelper
+from application.common import MysqlHelper, AliyunLogger
 from spider.spider_map import spider_map
 
 
 class OnlineManager(object):
     """
     Online crawler template
-    Todo: add Aliyun logging;
     """
     def __init__(self, task_id, mode, platform):
         self.env = "prod"
@@ -20,6 +19,7 @@ class OnlineManager(object):
         self.mode = mode
         self.platform = platform
         self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
+        self.logger = AliyunLogger(platform=self.platform, mode=mode)
 
     def get_task_rule(self):
         """
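Note: the `AliyunLogger(platform=..., mode=...)` constructor introduced here, together with the `.logging(code=..., message=..., data=...)` calls added in the hunks below, implies roughly the following interface. This is only a minimal sketch inferred from the call sites in this diff; the real class lives in `application.common`, and its actual behavior (Aliyun SLS endpoint, log store, field names) may differ.

```python
# Minimal sketch of the AliyunLogger interface implied by the call sites in
# this diff. Hypothetical: the real implementation in application.common
# presumably ships records to Aliyun SLS rather than printing them.
import json
from datetime import datetime


class AliyunLogger:
    def __init__(self, platform, mode):
        self.platform = platform
        self.mode = mode

    def logging(self, code, message, data=None):
        record = {
            "time": datetime.now().isoformat(),
            "platform": self.platform,
            "mode": self.mode,
            "code": code,
            "message": message,
            "data": data,
        }
        print(json.dumps(record, ensure_ascii=False, default=str))
```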
@@ -33,6 +33,11 @@ class OnlineManager(object):
         for item in rule_list:
             for key in item:
                 rule_dict[key] = item[key]
+        self.logger.logging(
+            code=1000,
+            message="抓取规则",
+            data=rule_dict
+        )
         return rule_dict
 
     def get_task_user_list(self):
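Note: the nested loop above merges a list of single-key rule dicts into one flat `rule_dict`. A minimal illustration with hypothetical rule names and values (the real keys come from the task's rule configuration in MySQL):

```python
# Hypothetical rule_list for illustration only; real keys and values come
# from the task's rule configuration row in MySQL.
rule_list = [{"duration": {"min": 30, "max": 0}}, {"period": {"min": 5, "max": 5}}]

rule_dict = {}
for item in rule_list:
    for key in item:
        rule_dict[key] = item[key]

print(rule_dict)
# {'duration': {'min': 30, 'max': 0}, 'period': {'min': 5, 'max': 5}}
```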
@@ -42,25 +47,38 @@ class OnlineManager(object):
         task_user_list_sql = f"SELECT uid, link from crawler_user_v3 where task_id = {self.task_id};"
         uid_list = self.MySQL.select(task_user_list_sql)
         user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
+        self.logger.logging(
+            code=1000,
+            message="用户列表",
+            data=user_list
+        )
         return user_list
 
     def start_crawl(self):
+        """
+        :return: crawler launch script
+        """
         rule_dict = self.get_task_rule()
         user_list = self.get_task_user_list()
         if rule_dict and user_list:
-            spider_class = spider_map[self.platform][self.mode]
-            main_process = spider_class(
-                platform=self.platform,
-                mode=self.mode,
-                rule_dict=rule_dict,
-                user_list=user_list,
-                env=self.env
-            )
-            main_process.run()
+            try:
+                spider_class = spider_map[self.platform][self.mode]
+                self.logger.logging(code=1003, message="开始一轮抓取")
+                main_process = spider_class(
+                    platform=self.platform,
+                    mode=self.mode,
+                    rule_dict=rule_dict,
+                    user_list=user_list,
+                    env=self.env
+                )
+                main_process.run()
+                self.logger.logging(code=1004, message="完成一轮抓取")
+            except Exception as e:
+                self.logger.logging(code=1006, message="启动爬虫出现错误, 报错原因是: {}".format(e))
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser() ## Create the argument parser
+    parser = argparse.ArgumentParser() # Create the argument parser
     parser.add_argument("--task_id")
     parser.add_argument("--mode")
     parser.add_argument("--platform")