"""
@author: luojunhui
Find WeChat accounts related to the in-house accounts and download their articles.
"""
import datetime
import json
import time

from pandas import DataFrame

from applications import PQMySQL, WeixinSpider, AlgApi, Functions
from config import accountBaseInfo


class weixinRelationAccountGoodArticles(object):
    """
    Crawl high-quality accounts that are related to the in-house accounts.

    All methods are classmethods that share the client objects below.
    """

    # Shared clients, instantiated once when the class body is executed.
    pq_mysql_client = PQMySQL()
    wx_spider = WeixinSpider()
    function = Functions()
    spider_client = PQMySQL()
    nlp = AlgApi()

    @classmethod
    def findInnerAccount(cls):
        """
        Collect the distinct in-house account ids.

        Every key of ``accountBaseInfo`` has its last two characters removed
        (presumably a position suffix -- TODO confirm against ``config``) and
        the remainders are de-duplicated.

        :return: list of unique ids, in no guaranteed order
        """
        return list({key[:-2] for key in accountBaseInfo})

    @classmethod
    def initAccount(cls, gh_id, account_name):
        """
        Register an associated account in ``long_articles_accounts``.

        One row is inserted for each ``account_position`` from 1 to 8.
        A failing insert is printed and does not stop the remaining inserts.

        :param gh_id: id of the account to register
        :param account_name: display name of the account
        :return: None
        """
        insert_sql = """
            INSERT INTO long_articles_accounts
            (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
            values
            (%s, %s, %s, %s, %s, %s, %s);
        """
        for position in range(1, 9):
            try:
                cls.pq_mysql_client.update(
                    sql=insert_sql,
                    params=(gh_id, "weixin", account_name, position, "association", 0, 1),
                )
            except Exception as e:
                # Best effort: report the failure and continue with the next position.
                print(e)
        print("账号初始化完成")

    @classmethod
    def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
        """
        Record an association between a source article and a discovered account.

        The row is stamped with the current local time and ``is_using = 1``.
        A failing insert is printed and otherwise ignored.

        :param gh_id: id of the associated account
        :param account_name: name of the associated account
        :param source_title: title of the source article that led to this account
        :param source_account: account the source article belongs to
        :return: None
        """
        insert_sql = """
            INSERT INTO long_articles_assiciation_accounts
            (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
            values
            (%s, %s, %s, %s, %s, %s);
        """
        try:
            cls.pq_mysql_client.update(
                sql=insert_sql,
                params=(
                    gh_id,
                    account_name,
                    source_title,
                    source_account,
                    str(datetime.datetime.now()),
                    1,
                ),
            )
        except Exception as e:
            print(e)

    @classmethod
    def getEachAccountArticle(cls, account_id):
        """
        Load the stored articles of one account from ``official_articles_v2``.

        :param account_id: value matched against the ``ghId`` column
        :return: DataFrame with columns ``title``, ``Type``, ``updateTime``,
            ``ItemIndex`` and ``show_view_count``
        """
        # NOTE(review): account_id is interpolated straight into the SQL text.
        # If ``select`` supports bound parameters, switch to a placeholder;
        # until that is confirmed, only pass trusted ids here.
        select_sql = f"""
            SELECT title, Type, updateTime, ItemIndex, show_view_count
            FROM official_articles_v2
            WHERE ghId = '{account_id}';
        """
        result = cls.pq_mysql_client.select(select_sql)
        return DataFrame(
            result,
            columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
        )

    @classmethod
    def filterGoodArticle(cls, article_data_frame):
        """
        Select the titles of articles with above-average view counts.

        An article qualifies when its ``show_view_count`` is greater than
        1.1 times the mean ``show_view_count`` of the frame.

        :param article_data_frame: DataFrame with ``title`` and
            ``show_view_count`` columns
        :return: list of qualifying titles
        """
        avg_view = article_data_frame["show_view_count"].mean()
        good_articles = article_data_frame[
            article_data_frame["show_view_count"] > avg_view * 1.1
        ]
        return good_articles["title"].values.tolist()

    @classmethod
    def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
        """
        Search articles by title and resolve the account behind each hit.

        The found titles are scored against ``source_account`` through the
        NLP client. When ``base_score`` is given, only hits whose score is
        strictly greater than it are kept; with the default ``None`` every
        scored hit is kept.

        :param source_account: account the source article belongs to
        :param source_title: title used as the search query
        :param base_score: optional minimum score (exclusive); ``None`` disables filtering
        :return: list of ``[found_title, account_info]`` pairs, empty when the
            search returns nothing
        """
        response = cls.wx_spider.search_articles(source_title)
        article_list = response["data"]["data"]
        if not article_list:
            return []

        title_list = [article["title"] for article in article_list]
        title_score_list = cls.nlp.getScoreList(
            accountName=source_account, title_list=title_list
        )[source_account]["score_list"]

        account_list = []
        # Scores are paired with the articles by position.
        for article_obj, score in zip(article_list, title_score_list):
            if base_score is not None and not score > base_score:
                continue
            account_info = cls.wx_spider.get_account_by_url(
                content_url=article_obj["url"]
            )
            account_list.append([article_obj["title"], account_info])
        return account_list

    @classmethod
    def insertIntoDataBase(cls, gh_id, article_list):
        """
        Store crawled articles in ``crawler_meta_article``.

        Each element of ``article_list`` is expected to hold its articles
        under ``["AppMsg"]["DetailInfo"]``. Every article is inserted on its
        own; a failure for one article is printed and the rest are still
        processed.

        :param gh_id: id of the account the articles belong to
        :param article_list: crawled message objects
        :return: None
        """
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    cls.spider_client.update(
                        sql=insert_sql,
                        params=(
                            "weixin",
                            "account",
                            "association",
                            gh_id,
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),
                            1,
                            cls.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    # Best effort: a bad row must not abort the whole batch.
                    print(e)

    # @classmethod
    # def searchResultFilter(cls, filter_type, info):
    #     """
    #     Filter search results.
    #     :param info: information to be filtered
    #     :param filter_type: "account" filters accounts, "article" filters articles
    #     :return: the filtered result
    #     """
    #     match filter_type:
    #         case "account":
    #             return account


def main():
    """
    Discover and register accounts related to the in-house accounts.

    For the first in-house account only, take its above-average articles,
    search each title, and register every account found that way unless its
    name contains one of the excluded keywords.
    """
    weixin = weixinRelationAccountGoodArticles()
    # Fetch the in-house accounts.
    inner_account_list = weixin.findInnerAccount()
    for source_account in inner_account_list[:1]:
        accountArticlesDataFrame = weixin.getEachAccountArticle(
            account_id=source_account
        )
        goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
        for title in goodArticles:
            account_list = weixin.searchGoodArticlesAccounts(
                source_account=source_account, source_title=title
            )
            print(title)
            print(source_account)
            for associated_account in account_list:
                associated_account_info = associated_account[1]
                account_name = associated_account_info["data"]["data"]["account_name"]
                gh_id = associated_account_info["data"]["data"]["wx_gh"]
                # Skip accounts whose name contains an excluded keyword.
                if '新闻' in account_name or '央视' in account_name:
                    continue
                # Initialise the account, then record how it was discovered.
                weixin.initAccount(gh_id=gh_id, account_name=account_name)
                weixin.putIntoAssociationGraph(
                    gh_id=gh_id,
                    account_name=account_name,
                    source_account=source_account,
                    source_title=title
                )


if __name__ == "__main__":
    main()