- """
- @author: luojunhui
- """
import time
import datetime

from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql


class SpiderTools(object):
    """
    Common entry point for the long-article crawler
    """

    spider_client = WeixinSpider()
    function = Functions()
    pq_mysql_client = PQMySQL()
    denet_mysql_client = DeNetMysql()

    @classmethod
    def searchEachAccountArticlesSinglePage(cls, gh_id, category):
        """
        Crawl a single page of the account's article list
        :param gh_id: gh_id of the target account
        :param category: category tag stored with each article
        :return:
        """
        response = cls.spider_client.update_msg_list(ghId=gh_id, index=None)
        msg_list = response.get("data", {}).get("data")
        if msg_list:
            cls.updateDataIntoMysql(
                gh_id=gh_id,
                category=category,
                mode="account",
                article_list=msg_list
            )
            cls.updateLatestAccountTimeStamp(gh_id=gh_id)
        else:
            print("No more data")

    @classmethod
    def searchEachAccountArticlesAllData(cls, gh_id, category, latest_time_stamp, index=None):
        """
        Crawl the account's latest data, paging back as far as 2024-01-01
        :param index: pagination cursor; None fetches the newest page
        :param gh_id: gh_id of the target account
        :param category: category tag stored with each article
        :param latest_time_stamp: publish time of the newest article already stored
        :return:
        """
        response = cls.spider_client.update_msg_list(ghId=gh_id, index=index)
        msg_list = response.get("data", {}).get("data")
        if msg_list:
            last_article_in_this_msg = msg_list[-1]
            cls.updateDataIntoMysql(
                gh_id=gh_id, category=category, article_list=msg_list, mode="account"
            )
            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
            if latest_time_stamp < last_time_stamp_in_this_msg:
                # the oldest article on this page is still newer than anything
                # stored, so keep paging back with the next cursor
                next_cursor = response["data"]["next_cursor"]
                return cls.searchEachAccountArticlesAllData(
                    gh_id=gh_id,
                    latest_time_stamp=latest_time_stamp,
                    category=category,
                    index=next_cursor,
                )
            else:
                # caught up with previously stored data: record the crawl time
                cls.updateLatestAccountTimeStamp(gh_id=gh_id)
        else:
            print("No more data")

    @classmethod
    def updateDataIntoMysql(cls, gh_id, category, mode, article_list):
        """
        Write the crawled articles into the database
        :param gh_id: gh_id of the target account
        :param category: category tag stored with each article
        :param mode: crawl mode, e.g. "account"
        :param article_list: raw message list returned by the spider
        :return:
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    # parse read / like counts out of the ShowDesc string
                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    insert_sql = """
                        INSERT INTO crawler_meta_article
                        (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
                        VALUES
                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                    """
                    cls.denet_mysql_client.update(
                        sql=insert_sql,
                        params=(
                            "weixin",
                            mode,
                            category,
                            gh_id,
                            obj["ItemIndex"],
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),
                            1,
                            cls.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    # malformed items and duplicate unique_index rows are skipped
                    print(e)
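
    # For reference, the keys read above imply each element of article_list
    # has roughly the shape sketched below (other keys omitted, values
    # illustrative only):
    #
    #   {
    #       "AppMsg": {
    #           "BaseInfo": {"UpdateTime": 1704067200},
    #           "DetailInfo": [
    #               {
    #                   "ItemIndex": 1,
    #                   "Title": "...",
    #                   "ContentUrl": "...",
    #                   "ShowDesc": "...",
    #                   "Digest": "...",
    #                   "send_time": 1704067200,
    #               }
    #           ],
    #       }
    #   }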

    @classmethod
    def updateLatestAccountTimeStamp(cls, gh_id):
        """
        Update the account's latest publish timestamp
        :param gh_id: gh_id of the target account
        :return:
        """
        select_sql = f"""
            SELECT publish_time
            FROM crawler_meta_article
            WHERE out_account_id = '{gh_id}'
            ORDER BY publish_time DESC LIMIT 1;
        """
        result = cls.denet_mysql_client.select(select_sql)
        if not result:
            # no articles stored for this account yet
            return
        time_stamp = result[0][0]
        # fromtimestamp() yields local wall-clock time directly, so no extra
        # astimezone() conversion is needed
        dt_object = datetime.datetime.fromtimestamp(time_stamp)
        dt_string = dt_object.strftime("%Y-%m-%d %H:%M:%S")
        update_sql = """
            UPDATE long_articles_accounts
            SET latest_update_time = %s
            WHERE account_id = %s;
        """
        cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))
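

if __name__ == "__main__":
    # A minimal smoke test, assuming the applications package and both
    # database clients are configured; the gh_id and category values are
    # hypothetical placeholders.
    SpiderTools.searchEachAccountArticlesSinglePage(
        gh_id="gh_xxxxxxxxxxxx",
        category="health",
    )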