run_spider_online.py

import json
import os
import sys
import argparse

sys.path.append(os.getcwd())

from application.common.mysql import MysqlHelper
from spider.spider_map import spider_map

class OnlineManager(object):
    """
    Online spider template.
    Todo: add Aliyun logging.
    """

    def __init__(self, task_id, mode, platform):
        self.env = "prod"
        self.task_id = task_id
        self.mode = mode
        self.platform = platform
        self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
    def get_task_rule(self):
        """
        :return: the task's rule dict (task_rule)
        """
        rule_dict = {}
        task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
        data = self.MySQL.select(task_rule_sql)
        if data:
            # The rule column stores a JSON list of single-key dicts; flatten it into one dict.
            rule_list = json.loads(data[0][0])
            for item in rule_list:
                for key in item:
                    rule_dict[key] = item[key]
        return rule_dict
    def get_task_user_list(self):
        """
        :return: the task's user list
        """
        task_user_list_sql = f"SELECT uid, link FROM crawler_user_v3 WHERE task_id = {self.task_id};"
        uid_list = self.MySQL.select(task_user_list_sql)
        user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
        return user_list
    def start_crawl(self):
        rule_dict = self.get_task_rule()
        user_list = self.get_task_user_list()
        if rule_dict and user_list:
            # spider_map is a nested mapping: platform -> mode -> spider class.
            spider_class = spider_map[self.platform][self.mode]
            main_process = spider_class(
                platform=self.platform,
                mode=self.mode,
                rule_dict=rule_dict,
                user_list=user_list,
                env=self.env,
            )
            main_process.run()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # build the command-line argument parser
    parser.add_argument("--task_id")
    parser.add_argument("--mode")
    parser.add_argument("--platform")
    args = parser.parse_args()
    M = OnlineManager(task_id=args.task_id, mode=args.mode, platform=args.platform)
    M.start_crawl()
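
# Example invocation (the argument values below are illustrative, not taken from the source):
#   python run_spider_online.py --task_id 21 --mode recommend --platform xiaoniangao
# task_id should reference a row in crawler_task_v3, and platform/mode must be keys in spider_map.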