""" @author: luojunhui @file: gzh_article_crawler.py @time: 2025/04/25 14:20 @desc: 抓取公众号文章 """ from __future__ import annotations import time import traceback from pymysql.cursors import DictCursor from tqdm import tqdm from applications import log from applications.api.gzh_api import get_gzh_account_article_list from applications.db import DatabaseConnector from applications.utils import Item from applications.utils import insert_into_article_meta_table from config import long_articles_config class GZHArticleCrawler: def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() def crawler_each_article(self, gh_id, category, article): article_item = Item() article_item.add("title", article["Title"]) article_item.add("platform", "weixin") article_item.add("mode", "account") article_item.add("category", category) article_item.add("out_account_id", gh_id) article_item.add("article_index", article["ItemIndex"]) article_item.add("link", article["ContentUrl"]) # article_item.add("read_cnt", article["ShowViewCount"]) # article_item.add("like_cnt", article["ShowLikeCount"]) article_item.add("description", article["Digest"]) article_item.add("publish_time", article["send_time"]) article_item.add("crawler_time", int(time.time())) article_item.add("status", 1) # article_item.add("unique_index", str_to_md5(article["ContentUrl"])) article_item.add("llm_sensitivity", -1) article_item.add("title_sensitivity", -1) # check item meta_item = article_item.check(source="article") insert_into_article_meta_table(db_client=self.db_client, article_item=meta_item) def insert_msg_into_article_meta_table(self, gh_id, category, msg_list): for article_msg in tqdm(msg_list, desc=f"crawler : {gh_id}"): article_list = article_msg["AppMsg"]["DetailInfo"] for article in article_list: try: self.crawler_each_article(gh_id, category, article) except Exception as e: print(e) print(traceback.format_exc()) def crawl_each_gzh_account_article_list( self, account_id: str, category: str, latest_update_timestamp: int ): next_cursor = None while True: fetch_response = get_gzh_account_article_list(account_id, next_cursor) account_msg_list = fetch_response.get("data", {}).get("data") if not account_msg_list: break self.insert_msg_into_article_meta_table( account_id, category, account_msg_list ) last_article = account_msg_list[-1] last_timestamp_in_page = last_article["AppMsg"]["BaseInfo"]["UpdateTime"] if latest_update_timestamp < last_timestamp_in_page: # update account timestamp break else: next_cursor = fetch_response["data"].get("next_cursor") if not next_cursor: break