# run_spider_online.py
  1. import json
  2. import os
  3. import sys
  4. import argparse
  5. sys.path.append(os.getcwd())
  6. from application.common import MysqlHelper, AliyunLogger
  7. from spider.spider_map import spider_map
  8. class OnlineManager(object):
  9. """
  10. 线上爬虫模版
  11. """
  12. def __init__(self, task_id, mode, platform):
  13. self.env = "prod"
  14. self.task_id = task_id
  15. self.mode = mode
  16. self.platform = platform
  17. self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
  18. self.logger = AliyunLogger(platform=self.platform, mode=mode)
  19. def get_task_rule(self):
  20. """
  21. :return: 返回任务的规则, task_rule
  22. """
  23. rule_dict = {}
  24. task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
  25. data = self.MySQL.select(task_rule_sql)
  26. if data:
  27. rule_list = json.loads(data[0][0])
  28. for item in rule_list:
  29. for key in item:
  30. rule_dict[key] = item[key]
  31. self.logger.logging(
  32. code=1000,
  33. message="抓取规则",
  34. data=rule_dict
  35. )
  36. return rule_dict
  37. def get_task_user_list(self):
  38. """
  39. :return: 返回用户列表
  40. """
  41. task_user_list_sql = f"SELECT uid, link from crawler_user_v3 where task_id = {self.task_id};"
  42. uid_list = self.MySQL.select(task_user_list_sql)
  43. user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
  44. self.logger.logging(
  45. code=1000,
  46. message="用户列表",
  47. data=user_list
  48. )
  49. return user_list
  50. def start_crawl(self):
  51. """
  52. :return: 爬虫启动脚本
  53. """
  54. rule_dict = self.get_task_rule()
  55. user_list = self.get_task_user_list()
  56. if rule_dict and user_list:
  57. try:
  58. spider_class = spider_map[self.platform][self.mode]
  59. self.logger.logging(code=1003, message="开始一轮抓取")
  60. main_process = spider_class(
  61. platform=self.platform,
  62. mode=self.mode,
  63. rule_dict=rule_dict,
  64. user_list=user_list,
  65. env=self.env
  66. )
  67. main_process.run()
  68. self.logger.logging(code=1004, message="完成一轮抓取")
  69. except Exception as e:
  70. self.logger.logging(code=1006, message="启动爬虫出现错误, 报错原因是: {}".format(e))
  71. if __name__ == "__main__":
  72. parser = argparse.ArgumentParser() # 新建参数解释器对象
  73. parser.add_argument("--task_id")
  74. parser.add_argument("--mode")
  75. parser.add_argument("--platform")
  76. args = parser.parse_args()
  77. M = OnlineManager(task_id=args.task_id, mode=args.mode, platform=args.platform)
  78. M.start_crawl()