lierqiang committed 2 years ago
Commit 5b68f1fa52

+ 5 - 6
README.MD

@@ -2,14 +2,13 @@
 
 ### Startup
 1. cd ./piaoquan_crawler
-2. sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} ${machine} >>${nohup_dir} 2>&1 &
+2. sh ./main/scheduling_main.sh ${crawler_dir} ${log_type} ${crawler} ${env} >>${nohup_dir} 2>&1 &
 ```commandline
 Parameter description
 ${crawler_dir}:     crawler entry path, e.g. scheduling/scheduling_main/run_write_task.py
 ${log_type}:        log naming pattern, e.g. scheduling-task produces 2023-02-08-scheduling-task.log under scheduling/logs/
 ${crawler}:         which crawler, e.g. youtube / kanyikan / weixinzhishu
 ${env}:             runtime environment, production: prod / testing: dev
-${machine}:         machine the crawler runs on, Alibaba Cloud servers: aliyun_hk / aliyun / local
 ${nohup_dir}:       nohup log path, e.g. scheduling/nohup-task.log
 ```
 
@@ -20,12 +19,12 @@ sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_
 sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" nohup-task.log 
 
 Hong Kong server
-sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" --machine="aliyun_hk" scheduling/nohup-write.log
-sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" --machine="aliyun_hk" scheduling/nohup-task.log
+sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="prod" scheduling/nohup-write.log
+sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="prod" scheduling/nohup-task.log
 
 Local debugging
-sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="dev" --machine="local" ./scheduling/nohup-write.log 
-sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="dev" --machine="local" ./scheduling/nohup-task.log 
+sh ./main/scheduling_main.sh scheduling/scheduling_main/run_write_task.py --log_type="scheduling-write" --crawler="scheduling" --env="dev" ./scheduling/nohup-write.log
+sh ./main/scheduling_main.sh scheduling/scheduling_main/run_scheduling_task.py --log_type="scheduling-task" --crawler="scheduling" --env="dev" ./scheduling/nohup-task.log
 
 Kill the process
 ps aux | grep scheduling
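
The `ps aux | grep scheduling` line above only lists the matching processes; the PIDs still have to be passed to `kill`. For reference, a minimal Python sketch of the same find-and-kill flow, assuming psutil is installed:

```python
# Minimal sketch, assuming psutil is installed: the Python equivalent of
# finding every "scheduling" process and sending it SIGKILL.
import psutil

me = psutil.Process().pid
for proc in psutil.process_iter(["pid", "cmdline"]):
    cmdline = " ".join(proc.info["cmdline"] or [])
    if "scheduling" in cmdline and proc.info["pid"] != me:
        try:
            proc.kill()  # same effect as kill -9 <pid>
        except psutil.NoSuchProcess:
            pass  # process exited between listing and kill
```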

+ 28 - 27
scheduling/scheduling_main/crawler_scheduling.py

@@ -40,7 +40,9 @@ class Scheduling:
             for pre_task in pre_task_list:
                 # The machine field only distinguishes overseas crawlers from domestic ones; it carries no other meaning
                 machine = pre_task.get('machine', 'dev')
-
+                next_time = pre_task['next_time']
+                interval_piaoquan = pre_task['interval_piaoquan']
+                task_id = pre_task['task_id']
                 if machine == "hk":
                     # write to Redis
                     task_key = 'crawler_config_task_queue:hk'
@@ -53,6 +55,8 @@ class Scheduling:
                     # write to Redis
                     task_key = 'crawler_config_task_queue:dev'
                     RedisHelper.redis_push(env, task_key, str(pre_task))
+                if int(time.time()) >= next_time:
+                    cls.update_task(log_type, crawler, task_id, next_time, interval_piaoquan, env)
 
     @classmethod
     def get_redis(cls, log_type, crawler, env):
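
Net effect of this hunk: each pre_task is routed to an env-specific Redis queue by its machine field, and the due-time check now runs at write time instead of at dispatch time. A minimal runnable sketch of that flow, with redis_push and update_task stubbed out as stand-ins for the project's real helpers (assumptions for this sketch):

```python
import time

# Stand-ins for the project's real helpers (assumptions for this sketch).
def redis_push(env, task_key, value):
    print(f"[{env}] LPUSH {task_key} {value}")

def update_task(task_id, next_time, interval_piaoquan, env):
    print(f"task {task_id}: next_time -> {next_time + interval_piaoquan}")

QUEUES = {
    "hk": "crawler_config_task_queue:hk",
    "aliyun": "crawler_config_task_queue:aliyun",
    "dev": "crawler_config_task_queue:dev",
}

def write_redis(pre_task_list, env):
    for pre_task in pre_task_list:
        # machine only routes tasks between overseas / domestic / dev queues
        machine = pre_task.get("machine", "dev")
        redis_push(env, QUEUES.get(machine, QUEUES["dev"]), str(pre_task))
        # Moved here by this commit: advance next_time as soon as it is due.
        if int(time.time()) >= pre_task["next_time"]:
            update_task(pre_task["task_id"], pre_task["next_time"],
                        pre_task["interval_piaoquan"], env)

write_redis([{"task_id": 1, "next_time": 0, "interval_piaoquan": 3600}], "dev")
```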
@@ -79,8 +83,7 @@ class Scheduling:
         Common.logger(log_type, crawler).info(f"Fetched scheduling task: {task}")
         task_id = task['task_id']
         source = task['source']
-        next_time = task['next_time']
-        interval_piaoquan = task['interval_piaoquan']
+
         spider_name = task['spider_name']
         if env == "aliyun":
             oss_endpoint = "inner"
@@ -89,30 +92,28 @@ class Scheduling:
         else:
             oss_endpoint = "out"
 
-        if int(time.time()) >= next_time:
-            cls.update_task(log_type, crawler, task_id, next_time, interval_piaoquan, env)
-        # production environment: dispatch the task
-            Common.logger(log_type, crawler).info(f"Start dispatching task: {task}\n")
-            task_str = [('task_id', str(task_id)), ('task_name', str(task['task_name'])),
-                        ('source', str(task['source'])), ('next_time', str(task['next_time'])),
-                        ('interval_piaoquan', str(task['interval_piaoquan'])),
-                        ('play_cnt', eval(task['spider_rule'])['play_cnt']),
-                        ('video_width', eval(task['spider_rule'])['video_width']),
-                        ('video_height', eval(task['spider_rule'])['video_height']),
-                        ('video_like', eval(task['spider_rule'])['video_like']),
-                        ('share_cnt', eval(task['spider_rule'])['share_cnt']),
-                        ('duration_min', eval(task['spider_rule'])['duration']['min']),
-                        ('duration_max', eval(task['spider_rule'])['duration']['max']),
-                        ('task_type', task['task_type']), ('spider_link', eval(task['spider_link'])),
-                        ('spider_name', str(task['spider_name'])), ('min_publish_time', str(task['min_publish_time'])),
-                        ('min_publish_day', str(task['min_publish_day'])), ('media_id', str(task['media_id'])),
-                        ('applets_status', str(task['applets_status'])), ('app_status', str(task['app_status'])),
-                        ('user_tag', str(task['user_tag'])), ('user_content_tag', str(task['user_content_tag'])),
-                        ('machine', str(task['machine']))]
-            task_str = str(task_str).replace(' ', '')
-            cmd = f"""sh scheduling/scheduling_main/scheduling.sh {source}/{source}_main/{spider_name}_scheduling.py --log_type="{spider_name}" --crawler="{source}" --task="{str(task_str)}" --oss_endpoint="{oss_endpoint}" --env="{env}" {source}/{source}-nohup.log """
-            Common.logger(log_type, crawler).info(f"cmd:{cmd}\n")
-            os.system(cmd)
+        # production environment: dispatch the task
+        Common.logger(log_type, crawler).info(f"Start dispatching task: {task}\n")
+        task_str = [('task_id', str(task_id)), ('task_name', str(task['task_name'])),
+                    ('source', str(task['source'])), ('next_time', str(task['next_time'])),
+                    ('interval_piaoquan', str(task['interval_piaoquan'])),
+                    ('play_cnt', eval(task['spider_rule'])['play_cnt']),
+                    ('video_width', eval(task['spider_rule'])['video_width']),
+                    ('video_height', eval(task['spider_rule'])['video_height']),
+                    ('video_like', eval(task['spider_rule'])['video_like']),
+                    ('share_cnt', eval(task['spider_rule'])['share_cnt']),
+                    ('duration_min', eval(task['spider_rule'])['duration']['min']),
+                    ('duration_max', eval(task['spider_rule'])['duration']['max']),
+                    ('task_type', task['task_type']), ('spider_link', eval(task['spider_link'])),
+                    ('spider_name', str(task['spider_name'])), ('min_publish_time', str(task['min_publish_time'])),
+                    ('min_publish_day', str(task['min_publish_day'])), ('media_id', str(task['media_id'])),
+                    ('applets_status', str(task['applets_status'])), ('app_status', str(task['app_status'])),
+                    ('user_tag', str(task['user_tag'])), ('user_content_tag', str(task['user_content_tag'])),
+                    ('machine', str(task['machine']))]
+        task_str = str(task_str).replace(' ', '')
+        cmd = f"""sh scheduling/scheduling_main/scheduling.sh {source}/{source}_main/{spider_name}_scheduling.py --log_type="{spider_name}" --crawler="{source}" --task="{str(task_str)}" --oss_endpoint="{oss_endpoint}" --env="{env}" {source}/{source}-nohup.log """
+        Common.logger(log_type, crawler).info(f"cmd:{cmd}\n")
+        os.system(cmd)
 
 
 if __name__ == "__main__":
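
One detail the moved block keeps is the repeated eval(task['spider_rule']) call, once per rule field. A sketch of a safer variant, assuming spider_rule is a dict literal stored as a string: parse it once with ast.literal_eval, which accepts only Python literals and raises on anything else instead of executing it:

```python
import ast

# Example rule string in the same shape the scheduler stores (assumption).
task = {"spider_rule": "{'play_cnt': 100, 'duration': {'min': 30, 'max': 600}}"}

# Parse once instead of calling eval() per field; literal_eval only accepts
# Python literals, so a tampered string raises ValueError instead of running.
rule = ast.literal_eval(task["spider_rule"])
rule_fields = [
    ("play_cnt", rule["play_cnt"]),
    ("duration_min", rule["duration"]["min"]),
    ("duration_max", rule["duration"]["max"]),
]
print(rule_fields)
```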

+ 4 - 6
scheduling/scheduling_main/run_scheduling_task.py

@@ -14,12 +14,10 @@ from scheduling.scheduling_main.crawler_scheduling import Scheduling
 class SchedulingTask:
     @classmethod
     def scheduling_task(cls, log_type, crawler, env):
-        while True:
-            Common.logger(log_type, crawler).info("Start scheduling crawler tasks")
-            Scheduling.scheduling_task(log_type, crawler, env)
-            Common.logger(log_type, crawler).info("Crawler task scheduling finished")
-            Common.del_logs(log_type, crawler)
-            time.sleep(60)
+        Common.logger(log_type, crawler).info("Start scheduling crawler tasks")
+        Scheduling.scheduling_task(log_type, crawler, env)
+        Common.logger(log_type, crawler).info("Crawler task scheduling finished")
+        Common.del_logs(log_type, crawler)
 
 
 if __name__ == "__main__":

+ 4 - 7
scheduling/scheduling_main/run_write_task.py

@@ -14,13 +14,10 @@ from scheduling.scheduling_main.crawler_scheduling import Scheduling
 class WriteTask:
     @classmethod
     def write_task(cls, log_type, crawler, env):
-        while True:
-            Common.logger(log_type, crawler).info("Start reading crawler tasks and writing them to Redis")
-            Scheduling.write_redis(log_type=log_type, crawler=crawler, env=env)
-            Common.logger(log_type, crawler).info("Finished writing to Redis")
-
-            Common.del_logs(log_type, crawler)
-            time.sleep(60)
+        Common.logger(log_type, crawler).info("Start reading crawler tasks and writing them to Redis")
+        Scheduling.write_redis(log_type=log_type, crawler=crawler, env=env)
+        Common.logger(log_type, crawler).info("Finished writing to Redis")
+        Common.del_logs(log_type, crawler)
 
 
 if __name__ == "__main__":
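
With the while True loops gone from both run_write_task.py and run_scheduling_task.py, each entry point now does a single pass per invocation, so the former 60-second cadence has to come from whatever launches them. A hypothetical external driver that restores that cadence for local testing; the module paths and method signatures match the diffs above, everything else is an assumption:

```python
# Hypothetical driver, not part of this commit: re-runs the now one-shot
# tasks every 60 s, the cadence the removed while-loops provided in-process.
import time

from scheduling.scheduling_main.run_scheduling_task import SchedulingTask
from scheduling.scheduling_main.run_write_task import WriteTask

while True:
    WriteTask.write_task("scheduling-write", "scheduling", "dev")
    SchedulingTask.scheduling_task("scheduling-task", "scheduling", "dev")
    time.sleep(60)
```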