- """
- @author: luojunhui
- @file: gzh_article_crawler.py
- @time: 2025/04/25 14:20
- @desc: 抓取公众号文章
- """
from __future__ import annotations

import time
import traceback

from pymysql.cursors import DictCursor
from tqdm import tqdm

from applications import log
from applications.api.gzh_api import get_gzh_account_article_list
from applications.db import DatabaseConnector
from applications.utils import Item, insert_into_article_meta_table

from config import long_articles_config
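
# Shape of the response expected from get_gzh_account_article_list, inferred
# from the access patterns in the methods below. Only the fields actually read
# here are listed; anything else in the payload is unknown:
# {
#     "data": {
#         "data": [                                    # list of message groups
#             {"AppMsg": {
#                 "BaseInfo": {"UpdateTime": 1745561000},
#                 "DetailInfo": [{
#                     "Title": "...",
#                     "ItemIndex": 1,
#                     "ContentUrl": "https://mp.weixin.qq.com/s/...",
#                     "Digest": "...",
#                     "send_time": 1745561000,
#                 }],
#             }}
#         ],
#         "next_cursor": "...",
#     }
# }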

class GZHArticleCrawler:
    """Crawl GZH (WeChat official account) articles into the article meta table."""

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def crawler_each_article(self, gh_id, category, article):
        """Build an Item from a raw article dict and insert it into the meta table."""
        article_item = Item()
        article_item.add("title", article["Title"])
        article_item.add("platform", "weixin")
        article_item.add("mode", "account")
        article_item.add("category", category)
        article_item.add("out_account_id", gh_id)
        article_item.add("article_index", article["ItemIndex"])
        article_item.add("link", article["ContentUrl"])
        # article_item.add("read_cnt", article["ShowViewCount"])
        # article_item.add("like_cnt", article["ShowLikeCount"])
        article_item.add("description", article["Digest"])
        article_item.add("publish_time", article["send_time"])
        article_item.add("crawler_time", int(time.time()))
        article_item.add("status", 1)
        # article_item.add("unique_index", str_to_md5(article["ContentUrl"]))
        article_item.add("llm_sensitivity", -1)
        article_item.add("title_sensitivity", -1)
        # validate the item before writing it to the meta table
        meta_item = article_item.check(source="article")
        insert_into_article_meta_table(db_client=self.db_client, article_item=meta_item)

    def insert_msg_into_article_meta_table(self, gh_id, category, msg_list):
        """Flatten each message group into individual articles and insert them."""
        for article_msg in tqdm(msg_list, desc=f"crawler: {gh_id}"):
            article_list = article_msg["AppMsg"]["DetailInfo"]
            for article in article_list:
                try:
                    self.crawler_each_article(gh_id, category, article)
                except Exception:
                    # one bad article should not abort the whole batch;
                    # format_exc() already includes the exception message
                    print(traceback.format_exc())

    def crawl_each_gzh_account_article_list(
        self, account_id: str, category: str, latest_update_timestamp: int
    ):
        """Page through an account's article list until already-crawled articles are reached.

        Assumes pages are returned newest-first, so `latest_update_timestamp`
        (the newest update time already stored for this account) serves as the
        stop condition for incremental crawling.
        """
        next_cursor = None
        while True:
            fetch_response = get_gzh_account_article_list(account_id, next_cursor)
            account_msg_list = fetch_response.get("data", {}).get("data")
            if not account_msg_list:
                break

            self.insert_msg_into_article_meta_table(
                account_id, category, account_msg_list
            )
            last_article = account_msg_list[-1]
            last_timestamp_in_page = last_article["AppMsg"]["BaseInfo"]["UpdateTime"]
            if last_timestamp_in_page <= latest_update_timestamp:
                # the oldest article in this page is no newer than the stored
                # checkpoint, so everything beyond it was crawled previously;
                # the caller can now update the account timestamp
                break

            next_cursor = fetch_response["data"].get("next_cursor")
            if not next_cursor:
                break
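

if __name__ == "__main__":
    # Usage sketch. The gh_id, category, and timestamp below are hypothetical
    # placeholders; in practice these values would come from the accounts table.
    crawler = GZHArticleCrawler()
    crawler.crawl_each_gzh_account_article_list(
        account_id="gh_xxxxxxxxxxxx",
        category="example_category",
        latest_update_timestamp=int(time.time()) - 7 * 86400,  # e.g. last week
    )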