wangkun 1 year ago
parent
revision
8ad5949f3e

+ 2 - 2
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -489,8 +489,8 @@ class GongzhonghaoAuthor1:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet >= 101:
-                len_sheet = 101
+            if len_sheet >= 141:
+                len_sheet = 141
             for i in range(1, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -486,13 +486,13 @@ class GongzhonghaoAuthor2:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 101:
+            if len_sheet <= 141:
                 Common.logger(log_type, crawler).info("抓取用户数<=100,无需启动第二套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=100,无需启动第二套抓取脚本\n")
                 return
-            if len_sheet >= 201:
-                len_sheet = 201
-            for i in range(101, len_sheet):
+            if len_sheet >= 261:
+                len_sheet = 261
+            for i in range(141, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -487,13 +487,13 @@ class GongzhonghaoAuthor3:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 201:
+            if len_sheet <= 261:
                 Common.logger(log_type, crawler).info("抓取用户数<=200,无需启动第三套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=200,无需启动第三套抓取脚本\n")
                 return
-            if len_sheet >= 301:
-                len_sheet = 301
-            for i in range(201, len_sheet):
+            if len_sheet >= 361:
+                len_sheet = 361
+            for i in range(261, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py

@@ -488,13 +488,13 @@ class GongzhonghaoAuthor4:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 301:
+            if len_sheet <= 361:
                 Common.logger(log_type, crawler).info("抓取用户数<=300,无需启动第四套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=300,无需启动第四套抓取脚本\n")
                 return
-            if len_sheet >= 401:
-                len_sheet = 401
-            for i in range(301, len_sheet):
+            if len_sheet >= 461:
+                len_sheet = 461
+            for i in range(361, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 2 - 2
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -488,13 +488,13 @@ class GongzhonghaoAuthor5:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 401:
+            if len_sheet <= 461:
                 Common.logger(log_type, crawler).info("抓取用户数<=400,无需启动第五套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=400,无需启动第五套抓取脚本\n")
                 return
             # if len_sheet >= 501:
             #     len_sheet = 501
-            for i in range(401, len_sheet):
+            for i in range(461, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,
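
A minimal standalone sketch (not part of this commit) of the row coverage that the five gongzhonghao*_author.py scripts implement after this change, assuming row 0 of the user sheet is a header row as the range(1, ...) loop above implies: script 1 takes rows 1-140, script 2 rows 141-260, script 3 rows 261-360, script 4 rows 361-460, and script 5 everything from row 461 on.

    # Hypothetical helper, for illustration only: which sheet rows each script iterates over.
    boundaries = [1, 141, 261, 361, 461]  # start rows of scripts 1-5 after this commit

    def script_rows(len_sheet):
        """Map script number -> range of user-sheet rows it will crawl."""
        ranges = {}
        for n, start in enumerate(boundaries, start=1):
            end = boundaries[n] if n < len(boundaries) else len_sheet  # script 5 is uncapped
            if len_sheet > start:
                ranges[n] = range(start, min(end, len_sheet))
        return ranges

    print(script_rows(500))
    # {1: range(1, 141), 2: range(141, 261), 3: range(261, 361), 4: range(361, 461), 5: range(461, 500)}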

+ 3 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao_author.py

@@ -223,7 +223,7 @@ class GongzhonghaoAuthor:
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['link']}, {user_dict['nick_name']}\n抓取异常, 请检查该公众号\n")
+                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 请检查该公众号\n")
                 return
             if 'app_msg_list' not in r.json():
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
@@ -454,8 +454,8 @@ class GongzhonghaoAuthor:
     @classmethod
     def get_all_videos(cls, log_type, crawler, token_index, rule_dict, user_list, env):
         for user_dict in user_list:
-            Common.logger(log_type, crawler).info(f'获取 {user_dict["nick_name"]} 公众号视频\n')
-            Common.logging(log_type, crawler, env, f'获取 {user_dict["nick_name"]} 公众号视频\n')
+            Common.logger(log_type, crawler).info(f'获取:{user_dict["nick_name"]} 公众号视频\n')
+            Common.logging(log_type, crawler, env, f'获取:{user_dict["nick_name"]} 公众号视频\n')
             try:
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,

+ 32 - 0
gongzhonghao/gongzhonghao_main/run_gzh1_author_dev.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/20
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+# from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
+# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
+# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
+from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+
+
+def gzh_main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info("开始抓取:公众号")
+    Common.logging(log_type, crawler, env, "开始抓取:公众号")
+    # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
+    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
+                                       crawler=crawler,
+                                       rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
+                                       env=env)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info('抓取一轮结束\n')
+    Common.logging(log_type, crawler, env, '抓取一轮结束\n')
+
+
+if __name__ == "__main__":
+    gzh_main(log_type="author", crawler="gongzhonghao", env="dev")

+ 2 - 2
gongzhonghao/gongzhonghao_main/run_gzh_author.py

@@ -98,8 +98,8 @@ def main(log_type, crawler, topic_name, group_id, env):
                 # 多进程并行抓取
                 processes = []
                 for i in range(crawler_num):
-                    start = i * chunk_size + 1
-                    end = min((i + 1) * chunk_size + 1, user_num + 1)
+                    start = i * chunk_size
+                    end = min((i + 1) * chunk_size, user_num + 1)
                     process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, i+1, task_dict, rule_dict, user_list[start:end], env))
                     process.start()
                     processes.append(process)
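
A minimal standalone sketch (not part of this commit) of why the dropped "+ 1" offsets matter: with the old 1-based start, user_list[0] was never dispatched and every chunk began one element late, while the 0-based start gives each process a clean, non-overlapping slice. The seven-element list below is hypothetical, purely for illustration:

    user_list = ["a", "b", "c", "d", "e", "f", "g"]
    user_num = len(user_list)
    chunk_size = 2  # users per process; 100 in main(), 2 here to keep the example small
    crawler_num = user_num // chunk_size + (1 if user_num % chunk_size else 0)

    old_chunks, new_chunks = [], []
    for i in range(crawler_num):
        old_chunks.append(user_list[i * chunk_size + 1:min((i + 1) * chunk_size + 1, user_num + 1)])
        new_chunks.append(user_list[i * chunk_size:min((i + 1) * chunk_size, user_num + 1)])

    print(old_chunks)  # [['b', 'c'], ['d', 'e'], ['f', 'g'], []]    -- 'a' is never crawled
    print(new_chunks)  # [['a', 'b'], ['c', 'd'], ['e', 'f'], ['g']] -- every user exactly once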

+ 56 - 20
gongzhonghao/gongzhonghao_main/run_gzh_author_dev.py

@@ -1,32 +1,68 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
-# @Time: 2023/6/20
-import os
-import sys
-sys.path.append(os.getcwd())
+# @Time: 2023/7/3
 from common.common import Common
-# from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
-# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
-# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
-from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+from common.scheduling_db import MysqlHelper
+from common.public import task_fun_mq
+from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor
 
 
-def gzh_main(log_type, crawler, env):
-    Common.logger(log_type, crawler).info("开始抓取:公众号")
-    Common.logging(log_type, crawler, env, "开始抓取:公众号")
-    # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
-    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
+def get_author_videos(log_type, crawler, token_index, task_dict, rule_dict, user_list, env):
+    Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logger(log_type, crawler).info(f"user_list:{user_list}")
+    Common.logging(log_type, crawler, env, f"user_list:{user_list}")
+    GongzhonghaoAuthor.get_all_videos(log_type=log_type,
                                        crawler=crawler,
-                                       rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
+                                       token_index=token_index,
+                                       rule_dict=rule_dict,
+                                       user_list=user_list,
                                        env=env)
     Common.del_logs(log_type, crawler)
     Common.logger(log_type, crawler).info('抓取一轮结束\n')
     Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
+def main(log_type, crawler, topic_name, group_id, env):
 
-if __name__ == "__main__":
-    gzh_main(log_type="author", crawler="gongzhonghao", env="dev")
+            # # 解析 task_dict
+            # task_dict = task_fun_mq(msg.message_body)['task_dict']
+            # Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+            # Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
+            #
+            # # 解析 rule_dict
+            # rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+            # Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+            # Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
+            #
+            # # 解析 user_list
+            # task_id = task_dict['id']
+            # select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+            # user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+            #
+            # # 计算启动脚本数 crawler_num
+            # user_num = len(user_list)
+            # chunk_size = 100  # 每个进程处理的用户数量
+            # crawler_num = int(user_num // chunk_size)  # 向下取整
+            # if user_num % chunk_size != 0:
+            #     crawler_num += 1
+            # Common.logger(log_type, crawler).info(f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+            # Common.logging(log_type, crawler, env, f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+            #
+            # # 多进程并行抓取
+            # processes = []
+            # for i in range(crawler_num):
+            #     start = i * chunk_size
+            #     end = min((i + 1) * chunk_size, user_num + 1)
+            #     process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, i+1, task_dict, rule_dict, user_list[start:end], env))
+            #     process.start()
+            #     processes.append(process)
+            #
+            # for process in processes:
+            #     process.join()
+            #
+            #
+            # Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            # Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
+            # time.sleep(2)
+            # continue
+            pass

+ 20 - 13
gongzhonghao/gongzhonghao_main/run_test.py

@@ -2,26 +2,33 @@
 # @Author: wangkun
 # @Time: 2023/6/30
 import datetime
-import multiprocessing
-import time
 from multiprocessing import Process
 
 
-def script():
-    while True:
-        print(f"{time.strftime('%H:%M:%S')}:==========")
-        time.sleep(10)
+def get_author_videos(user_list):
+    print(f"{datetime.datetime.now()}:{user_list}")
 
 
-def test(gzh_num):
-    crawler_num = int(gzh_num/100)
-    if gzh_num%100 != 0:
+def test_list():
+    user_list = ["a", "b", "c", "d", "e", "f", 1]
+    user_num = len(user_list)
+    chunk_size = 2  # 每个进程处理的用户数量
+    crawler_num = int(user_num // chunk_size)  # 向下取整
+    if user_num % chunk_size != 0:
         crawler_num += 1
-    print(crawler_num)
+    print(f"crawler_num:{crawler_num}")
+
+    processes = []
     for i in range(crawler_num):
-        process = Process(target=script)
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, user_num + 1)
+        process = Process(target=get_author_videos, args=(user_list[start:end],))
         process.start()
+        processes.append(process)
+
+    for process in processes:
+        process.join()
 
 if __name__ == "__main__":
-    print(datetime.date.today())
-    # test(692)
+    test_list()
+    pass

+ 1 - 1
xigua/xigua_main/run_xg_search.py

@@ -9,7 +9,7 @@ sys.path.append(os.getcwd())
 from common.common import Common
 from common.public import get_consumer, ack_message, task_fun_mq
 from common.scheduling_db import MysqlHelper
-from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+from xigua.xigua_search.xigua_search import XiguasearchScheduling
 
 
 def main(log_type, crawler, topic_name, group_id, env):

+ 1 - 1
xigua/xigua_main/run_xg_search_dev.py

@@ -5,7 +5,7 @@ import os
 import sys
 sys.path.append(os.getcwd())
 from common.common import Common
-from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+from xigua.xigua_search.xigua_search import XiguasearchScheduling
 
 
 def xigua_search_main(log_type, crawler, env):

+ 0 - 0
xigua/xigua_search/xigua_search_scheduling.py → xigua/xigua_search/xigua_search.py


+ 0 - 0
xigua/xigua_search/xigua_search_scheduling0628.py → xigua/xigua_search/xigua_search_publish_time.py