crawler_scheduling.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/2
import os
import sys
import time

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper, RedisHelper
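
# Assumed helper interfaces, inferred from the call sites below rather than
# from common/scheduling_db itself; the real signatures may differ:
#   MysqlHelper.get_values(log_type, crawler, sql, env, machine) -> list of dict rows
#   MysqlHelper.update_values(log_type, crawler, sql, env, machine)
#   RedisHelper.redis_push(env, machine, value: str)
#   RedisHelper.redis_pop(env, machine) -> bytes | None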


class Scheduling:
    # Read the task table and keep only tasks that are due
    @classmethod
    def get_task(cls, log_type, crawler, env, machine):
        get_sql = """ select * from crawler_task """
        all_task_list = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=get_sql, env=env, machine=machine)
        pre_task_list = []
        for task in all_task_list:
            # A task is due once the current timestamp reaches its next_time
            if int(time.time()) >= task["next_time"]:
                pre_task_list.append(task)
        return pre_task_list
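
    # A minimal sketch of a crawler_task row, inferred from the fields read in
    # scheduling_task() below; actual column names and types may differ:
    #   {"task_id": 1, "task_name": "...", "source": "...", "next_time": 1677729600,
    #    "interval_piaoquan": 3600, "spider_name": "...", "spider_rule": "{...}",
    #    "spider_link": "[...]", "task_type": "...", "min_publish_time": "...",
    #    "min_publish_day": "...", "media_id": "...", "applets_status": "...",
    #    "app_status": "...", "user_tag": "...", "user_content_tag": "...",
    #    "machine": "..."}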

    # Update the next run time; called when the task with this task_id is scheduled
    @classmethod
    def update_task(cls, log_type, crawler, task_id, next_time, interval_piaoquan, env, machine):
        if interval_piaoquan > 0:
            new_next_time = next_time + interval_piaoquan
            update_sql = f""" UPDATE crawler_task SET next_time={new_next_time} WHERE task_id={task_id} """
            MysqlHelper.update_values(log_type, crawler, update_sql, env, machine)
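
    # For example (hypothetical values): task_id=1, next_time=1677729600,
    # interval_piaoquan=3600 renders the statement
    #   UPDATE crawler_task SET next_time=1677733200 WHERE task_id=1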

    # Resource allocation / assembly
    @classmethod
    def write_redis(cls, log_type, crawler, env, machine):
        pre_task_list = cls.get_task(log_type=log_type, crawler=crawler, env=env, machine=machine)
        if len(pre_task_list) == 0:
            Common.logger(log_type, crawler).info("No new tasks\n")
        else:
            for pre_task in pre_task_list:
                if machine == "hk":
                    # Write to Redis (placeholder, not implemented for this machine yet)
                    pass
                elif machine == "aliyun":
                    # Write to Redis (placeholder, not implemented for this machine yet)
                    pass
                else:
                    # Write to Redis
                    RedisHelper.redis_push(env, machine, str(pre_task))
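
    # Note: tasks are serialized here with str(dict) and deserialized with
    # eval() in get_redis(); the two sides must stay in sync.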

    @classmethod
    def get_redis(cls, log_type, crawler, env, machine):
        redis_data = RedisHelper.redis_pop(env, machine)
        if redis_data is None or len(redis_data) == 0:
            Common.logger(log_type, crawler).info("Redis is empty, waiting 1 second")
            time.sleep(1)
            # Make the empty-queue case explicit for callers
            return None
        task = eval(str(redis_data, encoding="utf8"))
        return task
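
    # Design note: str(redis_data, encoding="utf8") decodes the popped bytes and
    # eval() rebuilds the dict written by write_redis(). eval() is only tolerable
    # here because the queue contents are produced by this process itself.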

    @classmethod
    def scheduling_task(cls, log_type, crawler, env, machine):
        task = cls.get_redis(log_type, crawler, env, machine)
        # get_redis() returns None when the queue is empty; nothing to schedule then
        if task is None:
            return
        Common.logger(log_type, crawler).info(f"Fetched scheduling task: {task}")
        task_id = task['task_id']
        source = task['source']
        next_time = task['next_time']
        interval_piaoquan = task['interval_piaoquan']
        spider_name = task['spider_name']
        if machine == "aliyun":
            oss_endpoint = "inner"
        elif machine == "aliyun_hk":
            oss_endpoint = "hk"
        else:
            oss_endpoint = "out"
        if int(time.time()) >= next_time:
            cls.update_task(log_type, crawler, task_id, next_time, interval_piaoquan, env, machine)
            # Production environment: dispatch the task
            Common.logger(log_type, crawler).info(f"Start dispatching task: {task}\n")
            # Parse spider_rule once instead of re-eval'ing it for every field
            spider_rule = eval(task['spider_rule'])
            task_str = [
                ('task_id', str(task_id)), ('task_name', str(task['task_name'])),
                ('source', str(task['source'])), ('next_time', str(task['next_time'])),
                ('interval_piaoquan', str(task['interval_piaoquan'])),
                ('play_cnt', spider_rule['play_cnt']), ('video_width', spider_rule['video_width']),
                ('video_height', spider_rule['video_height']), ('video_like', spider_rule['video_like']),
                ('share_cnt', spider_rule['share_cnt']),
                ('duration_min', spider_rule['duration']['min']), ('duration_max', spider_rule['duration']['max']),
                ('task_type', task['task_type']), ('spider_link', eval(task['spider_link'])),
                ('spider_name', str(task['spider_name'])), ('min_publish_time', str(task['min_publish_time'])),
                ('min_publish_day', str(task['min_publish_day'])), ('media_id', str(task['media_id'])),
                ('applets_status', str(task['applets_status'])), ('app_status', str(task['app_status'])),
                ('user_tag', str(task['user_tag'])), ('user_content_tag', str(task['user_content_tag'])),
                ('machine', str(task['machine'])),
            ]
            task_str = str(task_str).replace(' ', '')
            cmd = f"""sh scheduling/scheduling_main/scheduling.sh {source}/{source}_main/{spider_name}_scheduling.py --log_type="{spider_name}" --crawler="{source}" --task="{str(task_str)}" --oss_endpoint="{oss_endpoint}" --env="{env}" --machine="{machine}" {source}/{source}-nohup.log """
            Common.logger(log_type, crawler).info(f"cmd:{cmd}\n")
            os.system(cmd)
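
            # With hypothetical values source="xigua", spider_name="xigua_search",
            # env="dev", machine="local", the rendered command looks like:
            #   sh scheduling/scheduling_main/scheduling.sh \
            #       xigua/xigua_main/xigua_search_scheduling.py \
            #       --log_type="xigua_search" --crawler="xigua" --task="[...]" \
            #       --oss_endpoint="out" --env="dev" --machine="local" \
            #       xigua/xigua-nohup.log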


if __name__ == "__main__":
    # print(Scheduling.get_task("scheduling", "scheduling", "dev", "local"))
    # print(Scheduling.get_redis("scheduling", "scheduling", "dev", "local"))
    # Scheduling.write_redis("scheduling", "scheduling", "dev", "local")
    Scheduling.scheduling_task("scheduling", "scheduling", "dev", "local")