run_spider_online.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import json
  2. import os
  3. import sys
  4. import argparse
  5. sys.path.append(os.getcwd())
  6. from application.common.mysql import MysqlHelper
  7. from application.config import spider_map
  8. class OnlineManager(object):
  9. def __init__(self, task_id, mode, platform):
  10. self.env = "prod"
  11. self.task_id = task_id
  12. self.mode = mode
  13. self.platform = platform
  14. self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
  15. def get_task_rule(self):
  16. """
  17. :return: 返回任务的规则, task_rule
  18. """
  19. rule_dict = {}
  20. task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
  21. data = self.MySQL.select(task_rule_sql)
  22. if data:
  23. rule_list = json.loads(data[0][0])
  24. for item in rule_list:
  25. for key in item:
  26. rule_dict[key] = item[key]
  27. return rule_dict
  28. def get_task_user_list(self):
  29. """
  30. :return: 返回用户列表
  31. """
  32. task_user_list_sql = f"SELECT uid, link from crawler_user_v3 where task_id = {self.task_id};"
  33. uid_list = self.MySQL.select(task_user_list_sql)
  34. user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
  35. return user_list
  36. def start_crawl(self):
  37. rule_dict = self.get_task_rule()
  38. user_list = self.get_task_user_list()
  39. if rule_dict and user_list:
  40. spider_class = spider_map[self.platform][self.mode]
  41. main_process = spider_class(
  42. platform=self.platform,
  43. mode=self.mode,
  44. rule_dict=rule_dict,
  45. user_list=user_list,
  46. env=self.env
  47. )
  48. main_process.run()
  49. if __name__ == "__main__":
  50. parser = argparse.ArgumentParser() ## 新建参数解释器对象
  51. parser.add_argument("--task_id")
  52. parser.add_argument("--mode")
  53. parser.add_argument("--platform")
  54. args = parser.parse_args()
  55. M = OnlineManager(task_id=args.task_id, mode=args.mode, platform=args.platform)
  56. M.start_crawl()