""" @author: luojunhui 抓取全局品类文章 """ import time from tqdm import tqdm from applications import WeixinSpider, Functions, DeNetMysql, PQMySQL class weixinCategory(object): """ 微信全局品类账号抓取 """ def __init__(self): self.db_client_pq = PQMySQL() self.db_client_dt = DeNetMysql() self.spider = WeixinSpider() self.function = Functions() def getAccountList(self, account_category): """ 获取账号 :param account_category 品类 :return: """ sql = f""" select distinct gh_id, account_source, account_name, account_category, latest_update_time from long_articles_accounts where account_category = '{account_category}'; """ account_tuple = self.db_client_pq.select(sql) result = [ { "gh_id": i[0], "platform": i[1], "account_name": i[2], "category": i[3], "latest_timestamp": i[4], } for i in account_tuple ] return result def updateDataIntoMysql(self, gh_id, category, article_list): """ 将数据更新到数据库 :return: """ for article_obj in article_list: detail_article_list = article_obj["AppMsg"]["DetailInfo"] for obj in detail_article_list: try: show_stat = self.function.show_desc_to_sta(obj["ShowDesc"]) show_view_count = show_stat.get("show_view_count", 0) show_like_count = show_stat.get("show_like_count", 0) insert_sql = f""" insert into crawler_meta_article (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ self.db_client_dt.update( sql=insert_sql, params=( "weixin", "account", category, gh_id, obj['ItemIndex'], obj["Title"], obj["ContentUrl"], show_view_count, show_like_count, obj["Digest"], obj["send_time"], int(time.time()), 1, self.function.generateGzhId(obj["ContentUrl"]), ), ) except Exception as e: print(e) def updateLatestAccountTimeStamp(self, gh_id): """ 更新账号的最新时间戳 :return: """ select_sql = f""" SELECT publish_time From crawler_meta_article WHERE out_account_id = '{gh_id}' ORDER BY publish_time DESC LIMIT 1; """ result = self.db_client_dt.select(select_sql) time_stamp = result[0][0] dt_str = self.function.time_stamp_to_str(time_stamp) update_sql = f""" update long_articles_accounts set latest_update_time = %s where gh_id = %s; """ self.db_client_pq.update(sql=update_sql, params=(dt_str, gh_id)) def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None): """ 更新账号文章 :return: """ response = self.spider.update_msg_list(ghId=gh_id, index=index) msg_list = response.get("data", {}).get("data") if msg_list: last_article_in_this_msg = msg_list[-1] self.updateDataIntoMysql( gh_id=gh_id, category=category, article_list=msg_list ) last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"] if latest_time_stamp < last_time_stamp_in_this_msg: next_cursor = response["data"]["next_cursor"] return self.updateEachAccountArticles( gh_id=gh_id, latest_time_stamp=latest_time_stamp, category=category, index=next_cursor, ) else: # 更新最近抓取时间 self.updateLatestAccountTimeStamp(gh_id=gh_id) print("账号时间更新成功") else: print("No more data") if __name__ == "__main__": wxCategory = weixinCategory() category_list = [ '军事', '历史', '娱乐八卦', '情感生活', '健康养生', # '新闻媒体' ] for category in category_list: account_list = wxCategory.getAccountList(category) for account in tqdm(account_list): try: gh_id = account['gh_id'] category = account['category'] try: timestamp = int(account['latest_timestamp'].timestamp()) except Exception as e: timestamp = 1704038400 wxCategory.updateEachAccountArticles( gh_id=gh_id, category=category, latest_time_stamp=timestamp ) print("success") except Exception as e: print("fail because of {}".format(e))