|
@@ -1,135 +0,0 @@
|
|
|
-"""
|
|
|
-@author: luojunhui
|
|
|
-"""
|
|
|
-import time
|
|
|
-import datetime
|
|
|
-
|
|
|
-from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql
|
|
|
-
|
|
|
-
|
|
|
class SpiderTools(object):
    """
    Common entry point for the long-article crawler.

    All methods are classmethods that share the class-level clients below:
    ``spider_client`` fetches an account's message list, ``denet_mysql_client``
    stores/reads rows of ``crawler_meta_article`` and ``pq_mysql_client``
    updates ``long_articles_accounts``.
    """

    # Shared clients, created once when the class is defined.
    spider_client = WeixinSpider()
    function = Functions()
    pq_mysql_client = PQMySQL()
    denet_mysql_client = DeNetMysql()

    # Loop-invariant insert statement; values are always bound as parameters.
    _INSERT_ARTICLE_SQL = """
        insert into crawler_meta_article
        (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
        VALUES
        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """

    @staticmethod
    def _unwrap_payload(response):
        """
        Return the inner ``data`` dict of a spider response, or ``{}``.

        Tolerates a missing response and a ``data`` key that is absent or None.
        """
        return (response or {}).get("data") or {}

    @classmethod
    def searchEachAccountArticlesSinglePage(cls, gh_id, category):
        """
        Crawl the first page of an account's message list and store it.

        :param gh_id: account id passed to the spider and stored as out_account_id
        :param category: category value stored with every article
        :return: None
        """
        response = cls.spider_client.update_msg_list(ghId=gh_id, index=None)
        msg_list = cls._unwrap_payload(response).get("data")
        if not msg_list:
            print("No more data")
            return

        cls.updateDataIntoMysql(
            gh_id=gh_id,
            category=category,
            mode="account",
            article_list=msg_list,
        )
        cls.updateLatestAccountTimeStamp(gh_id=gh_id)

    @classmethod
    def searchEachAccountArticlesAllData(cls, gh_id, category, latest_time_stamp, index=None):
        """
        Crawl an account page by page until ``latest_time_stamp`` is reached.

        Every fetched page is stored. Paging continues while the last message
        of the page has an ``UpdateTime`` greater than ``latest_time_stamp``;
        once it does not (or the spider offers no further cursor) the
        account's latest-update time is refreshed and crawling stops. If a
        page comes back empty, "No more data" is printed and nothing else is
        updated.

        :param gh_id: account id passed to the spider and stored as out_account_id
        :param category: category value stored with every article
        :param latest_time_stamp: timestamp compared against each page's last UpdateTime
        :param index: cursor of the page to start from; None requests the first page
        :return: None
        """
        cursor = index
        # Iterative paging: a deep history no longer risks hitting the
        # interpreter's recursion limit.
        while True:
            response = cls.spider_client.update_msg_list(ghId=gh_id, index=cursor)
            payload = cls._unwrap_payload(response)
            msg_list = payload.get("data")
            if not msg_list:
                print("No more data")
                return

            last_article_in_this_msg = msg_list[-1]
            cls.updateDataIntoMysql(
                gh_id=gh_id, category=category, article_list=msg_list, mode="account"
            )
            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]

            next_cursor = payload.get("next_cursor")
            # A missing or unchanged cursor would request the same page again
            # forever, so it is treated as the end of the account's history.
            has_next_page = next_cursor is not None and next_cursor != cursor
            if latest_time_stamp < last_time_stamp_in_this_msg and has_next_page:
                cursor = next_cursor
                continue

            # Record the most recent crawl time for this account.
            cls.updateLatestAccountTimeStamp(gh_id=gh_id)
            return

    @classmethod
    def updateDataIntoMysql(cls, gh_id, category, mode, article_list):
        """
        Insert every article of ``article_list`` into ``crawler_meta_article``.

        Each message's ``AppMsg.DetailInfo`` entries are inserted one by one.
        A failure on one article is printed and skipped so the remaining
        articles are still processed.

        :param gh_id: account id stored as out_account_id
        :param category: category value stored with every article
        :param mode: crawl mode stored with every article (callers here pass "account")
        :param article_list: messages as returned by the spider
        :return: None
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    cls.denet_mysql_client.update(
                        sql=cls._INSERT_ARTICLE_SQL,
                        params=(
                            "weixin",
                            mode,
                            category,
                            gh_id,
                            obj["ItemIndex"],
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),
                            1,
                            cls.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    # Deliberate best effort: report the failure and move on
                    # to the next article.
                    print(e)

    @classmethod
    def updateLatestAccountTimeStamp(cls, gh_id):
        """
        Refresh ``latest_update_time`` of an account from its newest article.

        Reads the largest ``publish_time`` stored for ``gh_id`` in
        ``crawler_meta_article``, formats it as a local-time
        ``YYYY-MM-DD HH:MM:SS`` string and writes it to
        ``long_articles_accounts``. Does nothing (besides printing a notice)
        when the account has no stored article.

        :param gh_id: account id (out_account_id / account_id)
        :return: None
        """
        # NOTE(review): gh_id is interpolated into the SQL text because the
        # select helper's support for bound parameters is not visible here;
        # switch to a parameterized query if the helper allows it.
        select_sql = f"""
            SELECT publish_time
            From crawler_meta_article
            WHERE out_account_id = '{gh_id}'
            ORDER BY publish_time DESC LIMIT 1;
        """
        result = cls.denet_mysql_client.select(select_sql)
        if not result or result[0][0] is None:
            # Nothing stored for this account yet, so there is no time to record.
            print(f"No crawled articles found for account {gh_id}")
            return

        time_stamp = result[0][0]
        # Build a timezone-aware UTC datetime first, then convert it to the
        # local zone. Calling astimezone() on a naive UTC value would treat it
        # as already-local and leave the wall-clock time unshifted.
        utc_dt = datetime.datetime.fromtimestamp(time_stamp, tz=datetime.timezone.utc)
        local_dt = utc_dt.astimezone()
        dt_string = local_dt.strftime('%Y-%m-%d %H:%M:%S')
        update_sql = """
            update long_articles_accounts
            set latest_update_time = %s
            where account_id = %s;
        """
        cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))
|