Browse Source

上线账号抓取v2

luojunhui 5 months ago
parent
commit
86aae49380
3 changed files with 111 additions and 2 deletions
  1. 4 0
      applications/const.py
  2. 29 0
      applications/functions.py
  3. 78 2
      coldStartTasks/crawler/weixin_account_crawler.py

+ 4 - 0
applications/const.py

@@ -104,4 +104,8 @@ class WeixinVideoCrawlerConst:
     # 接口请求成功code
     REQUEST_SUCCESS = 0
 
+    # 是否需要扫描查询源账号
+    NEED_SCAN_SOURCE_ACCOUNT = 1
+    DO_NOT_NEED_SOURCE_ACCOUNT = 0
+
 

+ 29 - 0
applications/functions.py

@@ -185,6 +185,35 @@ class Functions(object):
         )
         return url
 
@classmethod
def get_source_account(cls, article_url: str) -> dict:
    """
    Extract the source (original) account of a reposted WeChat article.

    The article page embeds the original account as JavaScript literals:
    ``hit_nickname: '<name>'`` and ``hit_username: '<gh_id>'``.

    :param article_url: URL of the WeChat article page
    :return: ``{'name': ..., 'gh_id': ...}`` when both fields are found,
             otherwise an empty dict
    """
    response = requests.get(
        url=article_url,
        headers={'User-Agent': FakeUserAgent().random},
        # A missing timeout can hang the whole crawl on one stalled
        # connection; 10s is generous for a single page fetch.
        timeout=10,
    )
    html_text = response.text

    # Pull the two JS literals out of the raw HTML.
    nickname_match = re.search(r"hit_nickname:\s*'([^']+)'", html_text)
    username_match = re.search(r"hit_username:\s*'([^']+)'", html_text)

    if nickname_match and username_match:
        return {
            'name': nickname_match.group(1),
            'gh_id': username_match.group(1)
        }
    return {}
+
     @classmethod
     def download_gzh_video(cls, article_url):
         """

+ 78 - 2
coldStartTasks/crawler/weixin_account_crawler.py

@@ -3,7 +3,7 @@
 """
 import time
 import traceback
-from typing import List, Set, Dict
+from typing import List, Set, Dict, Tuple
 
 from tqdm import tqdm
 
@@ -37,6 +37,32 @@ class WeixinAccountCrawler(object):
             account_name_set.add(account_name_obj['account_name'])
         return account_name_set
 
def get_crawler_articles(self) -> List[Dict]:
    """
    Fetch already-crawled articles that are still flagged for
    source-account scanning.

    :return: list of rows, each carrying ``id`` and ``article_url``
    """
    query = f"""
            SELECT id, article_url
            FROM publish_single_video_source
            WHERE source_account = {const.NEED_SCAN_SOURCE_ACCOUNT};
        """
    return self.db_client.select_json(query)
+
def update_crawler_article_status(self, article_id_tuple: Tuple[int, ...]) -> int:
    """
    Mark the given articles as no longer needing a source-account scan.

    :param article_id_tuple: ids of rows in ``publish_single_video_source``
    :return: number of rows actually updated
    """
    # An empty tuple would render the clause as ``WHERE id IN ()`` —
    # a SQL syntax error — so short-circuit before touching the DB.
    if not article_id_tuple:
        return 0
    sql = """
        UPDATE publish_single_video_source
        SET source_account = %s
        WHERE id IN %s;
        """
    affected_rows = self.db_client.update(
        sql, (const.DO_NOT_NEED_SOURCE_ACCOUNT, article_id_tuple)
    )
    return affected_rows
+
     def get_seed_titles(self) -> List[str]:
         """
         :return:
@@ -166,7 +192,7 @@ class WeixinAccountCrawler(object):
 
         # 通知
         bot(
-            title="微信账号抓取完成",
+            title="微信账号抓取V1完成",
             detail={
                 "总更新账号数量": self.crawler_account_count,
                 "总耗时": time.time() - start_time,
@@ -174,3 +200,53 @@ class WeixinAccountCrawler(object):
             },
             mention=False
         )
+
def run_v2(self) -> None:
    """
    Entry point for account crawling v2: scan already-crawled articles
    for a linked source account and register every account found, then
    flag the processed articles so they are not scanned again.

    :return: None
    """
    crawler_article_list = self.get_crawler_articles()
    processed_id_list = []
    insert_account_count = 0
    for crawler_article_obj in tqdm(crawler_article_list, desc="crawler article list"):
        try:
            article_id = crawler_article_obj['id']
            article_url = crawler_article_obj['article_url']
            # Original (non-reposted) articles carry no source account.
            # NOTE(review): their rows keep the NEED_SCAN flag and will be
            # re-scanned on the next run — confirm this is intentional.
            if self.is_original(article_url):
                continue
            try:
                source_account_info = function.get_source_account(article_url)
            except Exception:
                # Best-effort: one unreachable page must not abort the run.
                continue
            if not source_account_info:
                continue
            affected_rows = self.insert_account(
                gh_id=source_account_info['gh_id'],
                account_name=source_account_info['name']
            )
            insert_account_count += affected_rows
            # Remember successfully handled articles for the batch update.
            processed_id_list.append(int(article_id))
        except Exception as e:
            print(e)
            print(traceback.format_exc())

    # Skip the update entirely when nothing was processed: an empty tuple
    # would produce a malformed ``IN ()`` clause in the UPDATE statement.
    if processed_id_list:
        self.update_crawler_article_status(tuple(processed_id_list))

    bot(
        title="微信账号抓取V2完成",
        detail={
            "扫描文章数量": len(crawler_article_list),
            "新增账号数量": insert_account_count
        },
        mention=False
    )
+
+