Explorar el Código

新增公众新号-调试

zhangyong hace 1 año
padre
commit
1cc8bd53c5

+ 8 - 11
gongzhongxinhao/gongzhongxinhao/gongzhongxinhao_author.py

@@ -23,13 +23,12 @@ class GZXHAuthor:
     """
     公众新号账号爬虫
     """
-    def __init__(self, platform, mode, rule_dict, user_dict, env, url_id):
+    def __init__(self, platform, mode, rule_dict, user_dict, env):
         self.platform = platform
         self.mode = mode
         self.rule_dict = rule_dict
         self.user_dict = user_dict
         self.env = env
-        self.url_id = url_id
         self.download_cnt = 0
         self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
 
@@ -39,7 +38,7 @@ class GZXHAuthor:
                 platform=self.platform,
                 mode=self.mode,
                 env=self.env,
-                message="开始抓取公众新号: {}".format(self.url_id["name"]),
+                message="开始抓取公众新号: {}".format(self.user_dict['link']),
             )
             try:
                 self.get_videoList()
@@ -49,14 +48,14 @@ class GZXHAuthor:
                     platform=self.platform,
                     mode=self.mode,
                     env=self.env,
-                    message=f"抓取公众新号: {self.url_id['name']} 时异常,异常信息: {e}",
+                    message=f"抓取公众新号: {self.user_dict['link']} 时异常,异常信息: {e}",
                 )
             AliyunLogger.logging(
                 code="1004",
                 platform=self.platform,
                 mode=self.mode,
                 env=self.env,
-                message="抓取公众新号: {}".format(self.url_id["name"]),
+                message="抓取公众新号: {}".format(self.user_dict['link']),
             )
 
             # 获取腾讯视频下载链接
@@ -145,7 +144,7 @@ class GZXHAuthor:
     def get_videoList(self):
         mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         time.sleep(1)
-        wechat_gh = self.get_wechat_gh(self.url_id["url"])
+        wechat_gh = self.get_wechat_gh(self.user_dict['link'])
         if None == wechat_gh:
 
             AliyunLogger.logging(
@@ -153,7 +152,7 @@ class GZXHAuthor:
                 platform=self.platform,
                 mode=self.mode,
                 env=self.env,
-                message=f"获取用主页为空{self.url_id['name']}",
+                message=f"获取用主页为空{self.user_dict['link']}",
             )
             return
         time.sleep(1)
@@ -221,7 +220,6 @@ class GZXHAuthor:
         date_time_obj = datetime.strptime(publish_time_str, date_format)
         publish_time_stamp = int(date_time_obj.timestamp())
         article_url = article.get("url", "")
-        our_user = random.choice(self.user_dict)
         video_url = self.get_video_url(article_url)
         video_dict = {
             "user_name": user_name,
@@ -237,7 +235,7 @@ class GZXHAuthor:
             "comment_cnt": 0,
             "like_cnt": 0,
             "share_cnt": 0,
-            "user_id": our_user["uid"],
+            "user_id": self.user_dict["uid"],
             "cover_url": article.get("head_pic", ""),
             "video_url": video_url,
             "width": 0,
@@ -279,8 +277,7 @@ class GZXHAuthor:
                 publish_time_str,
                 video_url,
                 article.get("head_pic", ""),
-                self.url_id["name"],
-                self.url_id["url"]
+                self.user_dict['link']
 
             ]]
             Feishu.insert_columns('gongzhonghao', 'gongzhonghao', "9QU7wE", "ROWS", 1, 2)

+ 5 - 15
gongzhongxinhao/gongzhongxinhao_main/run_gzxh_author.py

@@ -79,15 +79,6 @@ def main(log_type, crawler, topic_name, group_id, env):
                     env=env,
                     message=f"抓取规则:{rule_dict}\n",
                 )
-                audio_type = Feishu.get_values_batch("prod", "gongzhonghao", "QsTym9")
-                url_list = []
-                for row in audio_type[1:]:
-                    name = row[1]
-                    url = row[4]
-                    number = {"name": name, "url": url}
-                    if url:
-                        url_list.append(number)
-
                 # 解析 user_list
                 task_id = task_dict["id"]
                 select_user_sql = (
@@ -103,7 +94,7 @@ def main(log_type, crawler, topic_name, group_id, env):
                     env=env,
                     message="开始抓取"
                 )
-                for url_id in url_list:
+                for user_dict in user_list:
                     time.sleep(random.randint(1, 5))
                     try:
                         AliyunLogger.logging(
@@ -111,16 +102,15 @@ def main(log_type, crawler, topic_name, group_id, env):
                             platform=crawler,
                             mode=log_type,
                             env=env,
-                            message="开始抓取公众新号{}".format(url_id["name"]),
+                            message="开始抓取公众新号{}".format(user_dict),
                         )
                         # 初始化
                         GZXH = GZXHAuthor(
                             platform=crawler,
                             mode=log_type,
                             rule_dict=rule_dict,
-                            user_dict=user_list,
+                            user_dict=user_dict,
                             env=env,
-                            url_id=url_id,
                         )
                         GZXH.get_account_videos()
                         AliyunLogger.logging(
@@ -128,7 +118,7 @@ def main(log_type, crawler, topic_name, group_id, env):
                             platform=crawler,
                             mode=log_type,
                             env=env,
-                            message="完成抓取公众新号{}".format(url_id["name"]),
+                            message="完成抓取公众新号{}".format(user_dict),
                         )
                     except Exception as e:
                         AliyunLogger.logging(
@@ -136,7 +126,7 @@ def main(log_type, crawler, topic_name, group_id, env):
                             platform=crawler,
                             mode=log_type,
                             env=env,
-                            message="抓取公众新号{}出现问题, 报错为{}".format(url_id["name"], e),
+                            message="抓取公众新号{}出现问题, 报错为{}".format(user_dict, e),
                         )