123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- """
- @author: luojunhui
- 抓取全局品类文章
- """
- import time
- from tqdm import tqdm
- from applications import WeixinSpider, Functions, DeNetMysql, PQMySQL
class weixinCategory(object):
    """
    Crawl articles of WeChat official accounts that belong to a global category.

    Accounts are read from ``long_articles_accounts`` through ``db_client_pq``;
    crawled articles are written to ``crawler_meta_article`` through
    ``db_client_dt``.
    """

    def __init__(self):
        self.db_client_pq = PQMySQL()
        self.db_client_dt = DeNetMysql()
        self.spider = WeixinSpider()
        self.function = Functions()

    def getAccountList(self, account_category):
        """
        Fetch the accounts registered under one category.

        :param account_category: category value matched against the
            ``account_category`` column.
        :return: list of dicts with the keys ``gh_id``, ``platform``
            (the ``account_source`` column), ``account_name``, ``category``
            and ``latest_timestamp`` (the ``latest_update_time`` column).
        """
        # NOTE(review): the category is interpolated straight into the SQL text.
        # Callers in this file pass hard-coded values only; switch to bound
        # parameters if ``select`` supports them.
        sql = f"""
            select distinct gh_id, account_source, account_name, account_category, latest_update_time
            from long_articles_accounts
            where account_category = '{account_category}';
        """
        account_rows = self.db_client_pq.select(sql)
        return [
            {
                "gh_id": row[0],
                "platform": row[1],
                "account_name": row[2],
                "category": row[3],
                "latest_timestamp": row[4],
            }
            for row in account_rows
        ]

    def updateDataIntoMysql(self, gh_id, category, article_list):
        """
        Insert every article contained in ``article_list`` into
        ``crawler_meta_article``.

        :param gh_id: account id, stored as ``out_account_id``.
        :param category: category stored with each article.
        :param article_list: message objects; each one must provide
            ``["AppMsg"]["DetailInfo"]``, a list of article dicts.
        :return: None
        """
        # The statement has no interpolated parts, so build it once for all rows.
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    self.db_client_dt.update(
                        sql=insert_sql,
                        params=(
                            "weixin",
                            "account",
                            category,
                            gh_id,
                            obj["ItemIndex"],
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),
                            1,
                            self.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    # Best effort: a single article that cannot be stored is
                    # reported and skipped so the rest of the batch still lands.
                    print(e)

    def updateLatestAccountTimeStamp(self, gh_id):
        """
        Set the account's ``latest_update_time`` from its most recent stored
        article.

        :param gh_id: account id.
        :return: True when the account row was updated, False when no article
            is stored for the account and there is nothing to derive a
            timestamp from.
        """
        # NOTE(review): gh_id is interpolated into the SQL text; it originates
        # from our own accounts table in this file.
        select_sql = f"""
            SELECT publish_time
            From crawler_meta_article
            WHERE out_account_id = '{gh_id}'
            ORDER BY publish_time DESC LIMIT 1;
        """
        result = self.db_client_dt.select(select_sql)
        if not result:
            # Nothing stored for this account, e.g. every insert failed.
            print("No stored article for account {}, timestamp not updated".format(gh_id))
            return False
        time_stamp = result[0][0]
        dt_str = self.function.time_stamp_to_str(time_stamp)
        update_sql = """
            update long_articles_accounts
            set latest_update_time = %s
            where gh_id = %s;
        """
        self.db_client_pq.update(sql=update_sql, params=(dt_str, gh_id))
        return True

    def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None):
        """
        Crawl an account page by page and store the articles.

        Paging continues while the last message of the current page has an
        ``UpdateTime`` greater than ``latest_time_stamp``; once that is no
        longer the case the account's latest-update time is refreshed.
        (Presumably pages arrive newest first -- confirm against the spider.)

        :param gh_id: account id.
        :param category: category stored with each article.
        :param latest_time_stamp: timestamp to crawl back to.
        :param index: cursor of the page to start from; None requests the
            first page.
        :return: None
        """
        cursor = index
        # Iterative paging: long histories cannot exhaust the recursion limit.
        while True:
            response = self.spider.update_msg_list(ghId=gh_id, index=cursor)
            payload = response.get("data") or {}
            msg_list = payload.get("data")
            if not msg_list:
                print("No more data")
                return

            last_article_in_this_msg = msg_list[-1]
            self.updateDataIntoMysql(
                gh_id=gh_id, category=category, article_list=msg_list
            )
            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
            if not latest_time_stamp < last_time_stamp_in_this_msg:
                # Reached the already-crawled range: refresh the account's
                # latest crawl time.
                if self.updateLatestAccountTimeStamp(gh_id=gh_id):
                    print("账号时间更新成功")
                return

            next_cursor = payload.get("next_cursor")
            # A None cursor would request the first page again, and a cursor
            # that does not advance would repeat the same page forever.
            if next_cursor is None or next_cursor == cursor:
                print("No next cursor, stop paging")
                return
            cursor = next_cursor
if __name__ == "__main__":
    # Script entry point: crawl every account of each enabled category.
    crawler = weixinCategory()
    enabled_categories = [
        '军事',
        '历史',
        # Currently disabled: '娱乐八卦', '情感生活', '健康养生', '新闻媒体'
    ]
    for enabled_category in enabled_categories:
        accounts = crawler.getAccountList(enabled_category)
        for account in tqdm(accounts):
            # One failing account is reported and must not stop the run.
            try:
                account_id = account['gh_id']
                account_category = account['category']
                try:
                    start_ts = int(account['latest_timestamp'].timestamp())
                except Exception:
                    # No usable latest-update time on the account row: fall
                    # back to a fixed epoch (1704038400 is 2023-12-31T16:00:00Z).
                    start_ts = 1704038400
                crawler.updateEachAccountArticles(
                    gh_id=account_id,
                    category=account_category,
                    latest_time_stamp=start_ts,
                )
                print("success")
            except Exception as err:
                print("fail because of {}".format(err))
|