# crawler_scheduling.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/3/2
  4. import os
  5. import sys
  6. import time
  7. sys.path.append(os.getcwd())
  8. from common.common import Common
  9. from common.db import MysqlHelper
  10. class Scheduling:
  11. # 任务列表
  12. task_list = []
  13. # 读取任务表
  14. @classmethod
  15. def get_task(cls, log_type, crawler, env, machine):
  16. get_sql = """ select * from crawler_task_1 """
  17. all_task_list = MysqlHelper.get_values(log_type=log_type, crawler=crawler, sql=get_sql, env=env, machine=machine)
  18. pre_task_list = []
  19. for task in all_task_list:
  20. task_id = task[0]
  21. task_name = task[1]
  22. source = task[2]
  23. next_time = task[3]
  24. interval_piaoquan = task[4]
  25. spider_rule = task[5]
  26. task_type = task[6]
  27. spider_link = task[7]
  28. spider_name = task[8]
  29. min_publish_time = task[9]
  30. min_publish_day = task[10]
  31. media_id = task[11]
  32. applets_status = task[12]
  33. app_status = task[13]
  34. user_tag = task[14]
  35. user_content_tag = task[15]
  36. machine = task[16]
  37. insert_time = task[17]
  38. update_time = task[18]
  39. if next_time >= int(time.time()):
  40. task_dict = {
  41. "task_id": task_id,
  42. "task_name": task_name,
  43. "source": source,
  44. "next_time": next_time,
  45. "interval_piaoquan": interval_piaoquan,
  46. "spider_rule": spider_rule,
  47. "task_type": task_type,
  48. "spider_link": spider_link,
  49. "spider_name": spider_name,
  50. "min_publish_time": min_publish_time,
  51. "min_publish_day": min_publish_day,
  52. "media_id": media_id,
  53. "applets_status": applets_status,
  54. "app_status": app_status,
  55. "user_tag": user_tag,
  56. "user_content_tag": user_content_tag,
  57. "machine": machine,
  58. "insert_time": insert_time,
  59. "update_time": update_time,
  60. }
  61. pre_task_list.append(task_dict)
  62. return pre_task_list
  63. # 组装任务
  64. @classmethod
  65. def update_task(cls, log_type, crawler, env, machine):
  66. pre_task_list = cls.get_task(log_type=log_type, crawler=crawler, env=env, machine=machine)
  67. if len(pre_task_list) == 0:
  68. Common.logger(log_type, crawler).info("暂无新任务\n")
  69. else:
  70. for i in range(len(pre_task_list)):
  71. task_id = pre_task_list[i]["task_id"]
  72. task_name = pre_task_list[i]["task_name"]
  73. next_time = pre_task_list[i]["next_time"]
  74. interval_piaoquan = pre_task_list[i]["interval_piaoquan"]
  75. spider_rule = pre_task_list[i]["spider_rule"]
  76. print(f"task_id:{task_id}")
  77. print(f"task_name:{task_name}")
  78. print(f"next_time:{next_time}")
  79. print(f"interval_piaoquan:{interval_piaoquan}")
  80. print(f"spider_rule:{spider_rule}\n")
  81. # 资源分配
  82. @classmethod
  83. def resource_allocation(cls, log_type, crawler, env, machine):
  84. pass
  85. # 写入任务队列
  86. @classmethod
  87. def write_to_queue(cls):
  88. pass
  89. if __name__ == "__main__":
  90. # task_list = Scheduling.get_task("Scheduling", "scheduling", "dev", "local")
  91. # print(task_list)
  92. Scheduling.update_task("Scheduling", "scheduling", "dev", "local")
  93. pass