wangkun 1 year ago
parent
revision
8ad5949f3e

+ 2 - 2
gongzhonghao/gongzhonghao_author/gongzhonghao1_author.py

@@ -489,8 +489,8 @@ class GongzhonghaoAuthor1:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet >= 101:
-                len_sheet = 101
+            if len_sheet >= 141:
+                len_sheet = 141
             for i in range(1, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao2_author.py

@@ -486,13 +486,13 @@ class GongzhonghaoAuthor2:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 101:
+            if len_sheet <= 141:
                 Common.logger(log_type, crawler).info("抓取用户数<=100,无需启动第二套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=100,无需启动第二套抓取脚本\n")
                 return
-            if len_sheet >= 201:
-                len_sheet = 201
-            for i in range(101, len_sheet):
+            if len_sheet >= 261:
+                len_sheet = 261
+            for i in range(141, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao3_author.py

@@ -487,13 +487,13 @@ class GongzhonghaoAuthor3:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 201:
+            if len_sheet <= 261:
                 Common.logger(log_type, crawler).info("抓取用户数<=200,无需启动第三套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=200,无需启动第三套抓取脚本\n")
                 return
-            if len_sheet >= 301:
-                len_sheet = 301
-            for i in range(201, len_sheet):
+            if len_sheet >= 361:
+                len_sheet = 361
+            for i in range(261, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 4 - 4
gongzhonghao/gongzhonghao_author/gongzhonghao4_author.py

@@ -488,13 +488,13 @@ class GongzhonghaoAuthor4:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 301:
+            if len_sheet <= 361:
                 Common.logger(log_type, crawler).info("抓取用户数<=300,无需启动第四套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=300,无需启动第四套抓取脚本\n")
                 return
-            if len_sheet >= 401:
-                len_sheet = 401
-            for i in range(301, len_sheet):
+            if len_sheet >= 461:
+                len_sheet = 461
+            for i in range(361, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,

+ 2 - 2
gongzhonghao/gongzhonghao_author/gongzhonghao5_author.py

@@ -488,13 +488,13 @@ class GongzhonghaoAuthor5:
                 time.sleep(2)
                 continue
             len_sheet = len(user_sheet)
-            if len_sheet <= 401:
+            if len_sheet <= 461:
                 Common.logger(log_type, crawler).info("抓取用户数<=400,无需启动第五套抓取脚本\n")
                 Common.logging(log_type, crawler, env, "抓取用户数<=400,无需启动第五套抓取脚本\n")
                 return
             # if len_sheet >= 501:
             #     len_sheet = 501
-            for i in range(401, len_sheet):
+            for i in range(461, len_sheet):
                 user_dict = cls.get_users(log_type=log_type,
                                           crawler=crawler,
                                           user_sheet=user_sheet,
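
A minimal standalone sketch (not part of this commit) of the row coverage that the five gongzhonghao*_author.py scripts implement after this change, assuming row 0 of the user sheet is a header row as the range(1, ...) loop above implies: script 1 takes rows 1-140, script 2 rows 141-260, script 3 rows 261-360, script 4 rows 361-460, and script 5 everything from row 461 on.

    # Hypothetical helper, for illustration only: which sheet rows each script iterates over.
    boundaries = [1, 141, 261, 361, 461]  # start rows of scripts 1-5 after this commit

    def script_rows(len_sheet):
        """Map script number -> range of user-sheet rows it will crawl."""
        ranges = {}
        for n, start in enumerate(boundaries, start=1):
            end = boundaries[n] if n < len(boundaries) else len_sheet  # script 5 is uncapped
            if len_sheet > start:
                ranges[n] = range(start, min(end, len_sheet))
        return ranges

    print(script_rows(500))
    # {1: range(1, 141), 2: range(141, 261), 3: range(261, 361), 4: range(361, 461), 5: range(461, 500)}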

+ 3 - 3
gongzhonghao/gongzhonghao_author/gongzhonghao_author.py

@@ -223,7 +223,7 @@ class GongzhonghaoAuthor:
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 Common.logging(log_type, crawler, env, f"status_code:{r.status_code}, get_videoList:{r.text}\n")
                 if 20 >= datetime.datetime.now().hour >= 10:
-                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['link']}, {user_dict['nick_name']}\n抓取异常, 请检查该公众号\n")
+                    Feishu.bot(log_type, crawler,f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 请检查该公众号\n")
                 return
             if 'app_msg_list' not in r.json():
                 Common.logger(log_type, crawler).warning(f"status_code:{r.status_code}, get_videoList:{r.text}\n")
@@ -454,8 +454,8 @@ class GongzhonghaoAuthor:
     @classmethod
     def get_all_videos(cls, log_type, crawler, token_index, rule_dict, user_list, env):
         for user_dict in user_list:
-            Common.logger(log_type, crawler).info(f'获取 {user_dict["nick_name"]} 公众号视频\n')
-            Common.logging(log_type, crawler, env, f'获取 {user_dict["nick_name"]} 公众号视频\n')
+            Common.logger(log_type, crawler).info(f'获取:{user_dict["nick_name"]} 公众号视频\n')
+            Common.logging(log_type, crawler, env, f'获取:{user_dict["nick_name"]} 公众号视频\n')
             try:
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,

+ 32 - 0
gongzhonghao/gongzhonghao_main/run_gzh1_author_dev.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/20
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+# from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
+# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
+# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
+# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
+from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+
+
+def gzh_main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info("开始抓取:公众号")
+    Common.logging(log_type, crawler, env, "开始抓取:公众号")
+    # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
+    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
+    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
+                                       crawler=crawler,
+                                       rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
+                                       env=env)
+    Common.del_logs(log_type, crawler)
+    Common.logger(log_type, crawler).info('抓取一轮结束\n')
+    Common.logging(log_type, crawler, env, '抓取一轮结束\n')
+
+
+if __name__ == "__main__":
+    gzh_main(log_type="author", crawler="gongzhonghao", env="dev")

+ 2 - 2
gongzhonghao/gongzhonghao_main/run_gzh_author.py

@@ -98,8 +98,8 @@ def main(log_type, crawler, topic_name, group_id, env):
                 # 多进程并行抓取
                 processes = []
                 for i in range(crawler_num):
-                    start = i * chunk_size + 1
-                    end = min((i + 1) * chunk_size + 1, user_num + 1)
+                    start = i * chunk_size
+                    end = min((i + 1) * chunk_size, user_num + 1)
                     process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, i+1, task_dict, rule_dict, user_list[start:end], env))
                     process.start()
                     processes.append(process)
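
A minimal standalone sketch (not part of this commit) of why the dropped "+ 1" offsets matter: with the old 1-based start, user_list[0] was never dispatched and every chunk began one element late, while the 0-based start gives each process a clean, non-overlapping slice. The seven-element list below is hypothetical, purely for illustration:

    user_list = ["a", "b", "c", "d", "e", "f", "g"]
    user_num = len(user_list)
    chunk_size = 2  # users per process; 100 in main(), 2 here to keep the example small
    crawler_num = user_num // chunk_size + (1 if user_num % chunk_size else 0)

    old_chunks, new_chunks = [], []
    for i in range(crawler_num):
        old_chunks.append(user_list[i * chunk_size + 1:min((i + 1) * chunk_size + 1, user_num + 1)])
        new_chunks.append(user_list[i * chunk_size:min((i + 1) * chunk_size, user_num + 1)])

    print(old_chunks)  # [['b', 'c'], ['d', 'e'], ['f', 'g'], []]    -- 'a' is never crawled
    print(new_chunks)  # [['a', 'b'], ['c', 'd'], ['e', 'f'], ['g']] -- every user exactly once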

+ 56 - 20
gongzhonghao/gongzhonghao_main/run_gzh_author_dev.py

@@ -1,32 +1,68 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
-# @Time: 2023/6/20
-import os
-import sys
-sys.path.append(os.getcwd())
+# @Time: 2023/7/3
 from common.common import Common
-# from gongzhonghao.gongzhonghao_author.gongzhonghao1_author import GongzhonghaoAuthor1
-# from gongzhonghao.gongzhonghao_author.gongzhonghao2_author import GongzhonghaoAuthor2
-# from gongzhonghao.gongzhonghao_author.gongzhonghao3_author import GongzhonghaoAuthor3
-# from gongzhonghao.gongzhonghao_author.gongzhonghao4_author import GongzhonghaoAuthor4
-from gongzhonghao.gongzhonghao_author.gongzhonghao5_author import GongzhonghaoAuthor5
+from common.scheduling_db import MysqlHelper
+from common.public import task_fun_mq
+from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor
 
 
-def gzh_main(log_type, crawler, env):
-    Common.logger(log_type, crawler).info("开始抓取:公众号")
-    Common.logging(log_type, crawler, env, "开始抓取:公众号")
-    # GongzhonghaoAuthor1.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor2.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor3.get_all_videos(log_type=log_type,
-    # GongzhonghaoAuthor4.get_all_videos(log_type=log_type,
-    GongzhonghaoAuthor5.get_all_videos(log_type=log_type,
+def get_author_videos(log_type, crawler, token_index, task_dict, rule_dict, user_list, env):
+    Common.logger(log_type, crawler).info(f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logging(log_type, crawler, env, f'开始抓取:{task_dict["taskName"]}\n')
+    Common.logger(log_type, crawler).info(f"user_list:{user_list}")
+    Common.logging(log_type, crawler, env, f"user_list:{user_list}")
+    GongzhonghaoAuthor.get_all_videos(log_type=log_type,
                                        crawler=crawler,
-                                       rule_dict={"period": {"max": 1, "min": 1}, "duration": {"max": 2700, "min": 20}},
+                                       token_index=token_index,
+                                       rule_dict=rule_dict,
+                                       user_list=user_list,
                                        env=env)
     Common.del_logs(log_type, crawler)
     Common.logger(log_type, crawler).info('抓取一轮结束\n')
     Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
+def main(log_type, crawler, topic_name, group_id, env):
 
-if __name__ == "__main__":
-    gzh_main(log_type="author", crawler="gongzhonghao", env="dev")
+            # # 解析 task_dict
+            # task_dict = task_fun_mq(msg.message_body)['task_dict']
+            # Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+            # Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
+            #
+            # # 解析 rule_dict
+            # rule_dict = task_fun_mq(msg.message_body)['rule_dict']
+            # Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}\n")
+            # Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}\n")
+            #
+            # # 解析 user_list
+            # task_id = task_dict['id']
+            # select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
+            # user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
+            #
+            # # 计算启动脚本数 crawler_num
+            # user_num = len(user_list)
+            # chunk_size = 100  # 每个进程处理的用户数量
+            # crawler_num = int(user_num // chunk_size)  # 向下取整
+            # if user_num % chunk_size != 0:
+            #     crawler_num += 1
+            # Common.logger(log_type, crawler).info(f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+            # Common.logging(log_type, crawler, env, f"共{user_num}个公众号,需要启动{crawler_num}个脚本任务")
+            #
+            # # 多进程并行抓取
+            # processes = []
+            # for i in range(crawler_num):
+            #     start = i * chunk_size
+            #     end = min((i + 1) * chunk_size, user_num + 1)
+            #     process = Process(target=get_author_videos, args=(f"{log_type}{i+1}", crawler, i+1, task_dict, rule_dict, user_list[start:end], env))
+            #     process.start()
+            #     processes.append(process)
+            #
+            # for process in processes:
+            #     process.join()
+            #
+            #
+            # Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            # Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
+            # time.sleep(2)
+            # continue
+            pass

+ 20 - 13
gongzhonghao/gongzhonghao_main/run_test.py

@@ -2,26 +2,33 @@
 # @Author: wangkun
 # @Time: 2023/6/30
 import datetime
-import multiprocessing
-import time
 from multiprocessing import Process
 
 
-def script():
-    while True:
-        print(f"{time.strftime('%H:%M:%S')}:==========")
-        time.sleep(10)
+def get_author_videos(user_list):
+    print(f"{datetime.datetime.now()}:{user_list}")
 
 
-def test(gzh_num):
-    crawler_num = int(gzh_num/100)
-    if gzh_num%100 != 0:
+def test_list():
+    user_list = ["a", "b", "c", "d", "e", "f", 1]
+    user_num = len(user_list)
+    chunk_size = 2  # 每个进程处理的用户数量
+    crawler_num = int(user_num // chunk_size)  # 向下取整
+    if user_num % chunk_size != 0:
         crawler_num += 1
-    print(crawler_num)
+    print(f"crawler_num:{crawler_num}")
+
+    processes = []
     for i in range(crawler_num):
-        process = Process(target=script)
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, user_num + 1)
+        process = Process(target=get_author_videos, args=(user_list[start:end],))
         process.start()
+        processes.append(process)
+
+    for process in processes:
+        process.join()
 
 if __name__ == "__main__":
-    print(datetime.date.today())
-    # test(692)
+    test_list()
+    pass

+ 1 - 1
xigua/xigua_main/run_xg_search.py

@@ -9,7 +9,7 @@ sys.path.append(os.getcwd())
 from common.common import Common
 from common.public import get_consumer, ack_message, task_fun_mq
 from common.scheduling_db import MysqlHelper
-from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+from xigua.xigua_search.xigua_search import XiguasearchScheduling
 
 
 def main(log_type, crawler, topic_name, group_id, env):

+ 1 - 1
xigua/xigua_main/run_xg_search_dev.py

@@ -5,7 +5,7 @@ import os
 import sys
 sys.path.append(os.getcwd())
 from common.common import Common
-from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+from xigua.xigua_search.xigua_search import XiguasearchScheduling
 
 
 def xigua_search_main(log_type, crawler, env):

+ 0 - 0
xigua/xigua_search/xigua_search_scheduling.py → xigua/xigua_search/xigua_search.py


+ 0 - 0
xigua/xigua_search/xigua_search_scheduling0628.py → xigua/xigua_search/xigua_search_publish_time.py