""" @author: luojunhui """ import time import datetime from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql class SpiderTools(object): """ 长文爬虫公共入口 """ spider_client = WeixinSpider() function = Functions() pq_mysql_client = PQMySQL() denet_mysql_client = DeNetMysql() @classmethod def searchEachAccountArticlesSinglePage(cls, gh_id, category): """ 抓取账号单页 :param gh_id: :param category: :return: """ response = cls.spider_client.update_msg_list(ghId=gh_id, index=None) msg_list = response.get("data", {}).get("data") if msg_list: cls.updateDataIntoMysql( gh_id=gh_id, category=category, mode="account", article_list=msg_list ) cls.updateLatestAccountTimeStamp(gh_id=gh_id) else: print("No more data") @classmethod def searchEachAccountArticlesAllData(cls, gh_id, category, latest_time_stamp, index=None): """ 抓取账号截止到2024-01-01的最新数据 :param index: :param gh_id: :param category: :param latest_time_stamp :return: """ response = cls.spider_client.update_msg_list(ghId=gh_id, index=index) msg_list = response.get("data", {}).get("data") if msg_list: last_article_in_this_msg = msg_list[-1] cls.updateDataIntoMysql( gh_id=gh_id, category=category, article_list=msg_list, mode="account" ) last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"] if latest_time_stamp < last_time_stamp_in_this_msg: next_cursor = response["data"]["next_cursor"] return cls.searchEachAccountArticlesAllData( gh_id=gh_id, latest_time_stamp=latest_time_stamp, category=category, index=next_cursor, ) else: # 更新最近抓取时间 cls.updateLatestAccountTimeStamp(gh_id=gh_id) else: print("No more data") @classmethod def updateDataIntoMysql(cls, gh_id, category, mode, article_list): """ 将数据更新到数据库 :return: """ for article_obj in article_list: detail_article_list = article_obj["AppMsg"]["DetailInfo"] for obj in detail_article_list: try: show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"]) show_view_count = show_stat.get("show_view_count", 0) show_like_count = show_stat.get("show_like_count", 0) insert_sql = f""" insert into crawler_meta_article (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ cls.denet_mysql_client.update( sql=insert_sql, params=( "weixin", mode, category, gh_id, obj['ItemIndex'], obj["Title"], obj["ContentUrl"], show_view_count, show_like_count, obj["Digest"], obj["send_time"], int(time.time()), 1, cls.function.generateGzhId(obj["ContentUrl"]), ), ) except Exception as e: print(e) @classmethod def updateLatestAccountTimeStamp(cls, gh_id): """ 更新账号的最新时间戳 :return: """ select_sql = f""" SELECT publish_time From crawler_meta_article WHERE out_account_id = '{gh_id}' ORDER BY publish_time DESC LIMIT 1; """ result = cls.denet_mysql_client.select(select_sql) time_stamp = result[0][0] dt_object = datetime.datetime.utcfromtimestamp(time_stamp) local_dt = dt_object.astimezone() dt_string = local_dt.strftime('%Y-%m-%d %H:%M:%S') update_sql = f""" update long_articles_accounts set latest_update_time = %s where account_id = %s; """ cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))