
main.py: modified some of the log calls;
run_spider_online.py: the online spider template is now complete;

罗俊辉, 1 year ago
Parent commit 6bdb6bb62e
2 changed files with 31 additions and 13 deletions
  1. app/main.py (+1, -1)
  2. scheduler/run_spider_online.py (+30, -12)

+ 1 - 1
app/main.py

@@ -25,7 +25,7 @@ async def run(task_id, mode, platform):
     """
     # Create an Aliyun logger object
     logger = AliyunLogger(platform=platform, mode=mode)
-    logger.logging(code=1003, message="{}: 开始一轮抓取".format(platform))
+    logger.logging(code=1005, message="{}: 启动进程".format(platform))
     # Create and launch a child process
     await asyncio.create_subprocess_shell(
         "python3 scheduler/run_spider_online.py --task_id {} --mode {} --platform {}".format(

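For reference, the hunk above only changes the log call inside the async run() coroutine. Below is a minimal sketch of the whole coroutine, assuming the import path used by run_spider_online.py further down and completing the .format() arguments that the hunk cuts off; both of those details are assumptions, not part of the visible diff.

import asyncio
from application.common import AliyunLogger  # import path assumed from run_spider_online.py below

async def run(task_id, mode, platform):
    """
    Launch one crawl for a platform by spawning the online spider script.
    """
    # Create an Aliyun logger for this platform/mode (as in the diff)
    logger = AliyunLogger(platform=platform, mode=mode)
    logger.logging(code=1005, message="{}: 启动进程".format(platform))
    # Create and launch a child process running the online spider entry point
    await asyncio.create_subprocess_shell(
        "python3 scheduler/run_spider_online.py --task_id {} --mode {} --platform {}".format(
            task_id, mode, platform  # assumed arguments, matching the flags parsed below
        )
    )
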
+ 30 - 12
scheduler/run_spider_online.py

@@ -5,14 +5,13 @@ import argparse
 
 sys.path.append(os.getcwd())
 
-from application.common.mysql import MysqlHelper
+from application.common import MysqlHelper, AliyunLogger
 from spider.spider_map import spider_map
 
 
 class OnlineManager(object):
     """
     Online spider template
-    Todo: add Aliyun logging;
     """
     def __init__(self, task_id, mode, platform):
         self.env = "prod"
@@ -20,6 +19,7 @@ class OnlineManager(object):
         self.mode = mode
         self.platform = platform
         self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
+        self.logger = AliyunLogger(platform=self.platform, mode=mode)
 
     def get_task_rule(self):
         """
@@ -33,6 +33,11 @@ class OnlineManager(object):
             for item in rule_list:
                 for key in item:
                     rule_dict[key] = item[key]
+        self.logger.logging(
+            code=1000,
+            message="抓取规则",
+            data=rule_dict
+        )
         return rule_dict
 
     def get_task_user_list(self):
@@ -42,25 +47,38 @@ class OnlineManager(object):
         task_user_list_sql = f"SELECT uid, link from crawler_user_v3 where task_id = {self.task_id};"
         uid_list = self.MySQL.select(task_user_list_sql)
         user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
+        self.logger.logging(
+            code=1000,
+            message="用户列表",
+            data=user_list
+        )
         return user_list
 
     def start_crawl(self):
+        """
+        :return: spider launch script
+        """
         rule_dict = self.get_task_rule()
         user_list = self.get_task_user_list()
         if rule_dict and user_list:
-            spider_class = spider_map[self.platform][self.mode]
-            main_process = spider_class(
-                platform=self.platform,
-                mode=self.mode,
-                rule_dict=rule_dict,
-                user_list=user_list,
-                env=self.env
-            )
-            main_process.run()
+            try:
+                spider_class = spider_map[self.platform][self.mode]
+                self.logger.logging(code=1003, message="开始一轮抓取")
+                main_process = spider_class(
+                    platform=self.platform,
+                    mode=self.mode,
+                    rule_dict=rule_dict,
+                    user_list=user_list,
+                    env=self.env
+                )
+                main_process.run()
+                self.logger.logging(code=1004, message="完成一轮抓取")
+            except Exception as e:
+                self.logger.logging(code=1006, message="启动爬虫出现错误, 报错原因是: {}".format(e))
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()  ## Create an argument parser object
+    parser = argparse.ArgumentParser()  # Create an argument parser object
     parser.add_argument("--task_id")
     parser.add_argument("--mode")
     parser.add_argument("--platform")
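
The excerpt ends inside the __main__ block. Based on the OnlineManager constructor signature and the three flags registered above, the remainder presumably parses the arguments and hands them to OnlineManager; the following is a hedged sketch of that tail, in which the variable names and the final start_crawl() call are assumptions rather than the committed code.

    args = parser.parse_args()
    # Hand the CLI arguments to the online manager and run one crawl round
    manager = OnlineManager(
        task_id=args.task_id,
        mode=args.mode,
        platform=args.platform,
    )
    manager.start_crawl()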