"""
@author: luojunhui
Fetch and download articles from WeChat accounts related to in-house accounts
"""
- import datetime
- import json
- import time
- from pandas import DataFrame
- from applications import PQMySQL, WeixinSpider, AlgApi, Functions
- from config import accountBaseInfo
class weixinRelationAccountGoodArticles(object):
    """
    Crawl accounts associated with the well-performing articles of in-house accounts.

    Every method is a classmethod working on the shared client objects below,
    which are created once when the class body is executed.
    """

    # MySQL client used for the account tables and for reading official_articles_v2
    pq_mysql_client = PQMySQL()
    # WeChat spider: article search and article-url -> account lookup
    wx_spider = WeixinSpider()
    # helper collection (provides show_desc_to_sta and generateGzhId)
    function = Functions()
    # second PQMySQL client, used for the writes into crawler_meta_article
    spider_client = PQMySQL()
    # scoring API queried through getScoreList
    nlp = AlgApi()

    @classmethod
    def findInnerAccount(cls):
        """
        Collect the ids of the in-house accounts.

        Every key of ``accountBaseInfo`` is reduced to an id by dropping its
        last two characters; duplicates are removed.

        :return: sorted list of unique ids. Sorting keeps the order stable
            between runs, which plain set iteration over strings does not.
        """
        return sorted({key[:-2] for key in accountBaseInfo})

    @classmethod
    def initAccount(cls, gh_id, account_name):
        """
        Register an account in ``long_articles_accounts``, one row per position.

        A row is inserted for each account_position from 1 to 8 with
        account_source "weixin", account_category "association",
        whether_inner_account 0 and is_using 1. A failing insert is printed
        and the remaining positions are still attempted.

        :param gh_id: id of the account
        :param account_name: name of the account
        :return: None
        """
        # The statement has no interpolated parts, so it is built once.
        insert_sql = """
            INSERT INTO long_articles_accounts
            (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
            values
            (%s, %s, %s, %s, %s, %s, %s);
        """
        for position in range(1, 9):
            try:
                cls.pq_mysql_client.update(
                    sql=insert_sql,
                    params=(gh_id, "weixin", account_name, position, "association", 0, 1),
                )
            except Exception as e:
                # best effort: report the failure and go on with the next position
                print(e)
        print("账号初始化完成")

    @classmethod
    def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
        """
        Record an associated account in the association table.

        The row is stamped with the current local time and is_using = 1.
        A failing insert is printed and otherwise ignored.

        :param gh_id: id of the associated account
        :param account_name: name of the associated account
        :param source_title: title of the source article
        :param source_account: source account
        :return: None
        """
        # NOTE: the table name is spelled exactly as in the original statement.
        insert_sql = """
            INSERT INTO long_articles_assiciation_accounts
            (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
            values
            (%s, %s, %s, %s, %s, %s);
        """
        try:
            cls.pq_mysql_client.update(
                sql=insert_sql,
                params=(
                    gh_id,
                    account_name,
                    source_title,
                    source_account,
                    str(datetime.datetime.now()),
                    1,
                ),
            )
        except Exception as e:
            print(e)

    @classmethod
    def getEachAccountArticle(cls, account_id):
        """
        Load the articles stored for one account from ``official_articles_v2``.

        :param account_id: value matched against the ``ghId`` column
        :return: DataFrame with the columns title, Type, updateTime, ItemIndex
            and show_view_count
        """
        # NOTE(review): account_id is interpolated directly into the statement.
        # Switch to a parameterized query if the select() helper supports
        # parameters and account_id can ever come from an untrusted source.
        select_sql = f"""
            SELECT title, Type, updateTime, ItemIndex, show_view_count
            FROM official_articles_v2
            WHERE ghId = '{account_id}';
        """
        result = cls.pq_mysql_client.select(select_sql)
        return DataFrame(
            result,
            columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
        )

    @classmethod
    def filterGoodArticle(cls, article_data_frame):
        """
        Pick the titles of the articles that perform above average.

        An article qualifies when its ``show_view_count`` is greater than
        1.1 times the mean ``show_view_count`` of the frame.

        :param article_data_frame: frame with ``title`` and ``show_view_count`` columns
        :return: list of qualifying titles
        """
        view_counts = article_data_frame["show_view_count"]
        threshold = view_counts.mean() * 1.1
        return article_data_frame.loc[view_counts > threshold, "title"].tolist()

    @classmethod
    def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
        """
        Search articles by title and resolve the account behind each result.

        :param source_account: account passed to the scoring API as ``accountName``
        :param source_title: title used as the search query
        :param base_score: optional threshold; when given, only results whose
            score is greater than it are kept. ``None`` keeps every result.
        :return: list of ``[title, account_info]`` pairs; empty when the
            search yields no articles
        """
        response = cls.wx_spider.search_articles(source_title)
        article_list = response["data"]["data"]
        if not article_list:
            return []

        title_list = [article["title"] for article in article_list]
        title_score_list = cls.nlp.getScoreList(
            accountName=source_account, title_list=title_list
        )[source_account]["score_list"]

        account_list = []
        # zip pairs each article with its score and stops at the shorter list,
        # so a score list of unexpected length cannot raise IndexError.
        for article_obj, score in zip(article_list, title_score_list):
            if base_score is not None and not score > base_score:
                continue
            account_info = cls.wx_spider.get_account_by_url(
                content_url=article_obj["url"]
            )
            account_list.append([article_obj["title"], account_info])
        return account_list

    @classmethod
    def insertIntoDataBase(cls, gh_id, article_list):
        """
        Store crawled articles in ``crawler_meta_article``.

        Each element of ``article_list`` is read at ``["AppMsg"]["DetailInfo"]``,
        and every entry found there is inserted as one row. A failure on one
        entry is printed and does not stop the remaining entries.

        :param gh_id: account id stored as ``out_account_id``
        :param article_list: iterable of message objects as described above
        :return: None
        """
        # The statement has no interpolated parts, so it is built once.
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    cls.spider_client.update(
                        sql=insert_sql,
                        params=(
                            "weixin",
                            "account",
                            "association",
                            gh_id,
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),  # crawl time in epoch seconds
                            1,
                            cls.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    # best effort: report the failure and go on with the next entry
                    print(e)
    # @classmethod
    # def searchResultFilter(cls, filter_type, info):
    #     """
    #     Filter search results
    #     :param info: the information to filter
    #     :param filter_type: kind of filter; "account" filters accounts, "article" filters articles
    #     :return: the filtered result
    #     """
    #     match filter_type:
    #         case "account":
    #             return account
if __name__ == "__main__":
    # Accounts whose name contains one of these keywords are not registered.
    blocked_keywords = ("新闻", "央视")

    weixin = weixinRelationAccountGoodArticles()
    # ids of the in-house accounts
    inner_account_list = weixin.findInnerAccount()
    # only the first in-house account is processed
    for source_account in inner_account_list[:1]:
        accountArticlesDataFrame = weixin.getEachAccountArticle(
            account_id=source_account
        )
        goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
        for title in goodArticles:
            account_list = weixin.searchGoodArticlesAccounts(
                source_account=source_account, source_title=title
            )
            print(title)
            print(source_account)
            # each entry is a [matched_title, account_info] pair; only the
            # account info is needed here
            for _, associated_account_info in account_list:
                try:
                    account_detail = associated_account_info["data"]["data"]
                    account_name = account_detail["account_name"]
                    gh_id = account_detail["wx_gh"]
                except (KeyError, TypeError) as e:
                    # the account lookup returned an unexpected payload;
                    # report it and continue with the next result
                    print(e)
                    continue
                if any(keyword in account_name for keyword in blocked_keywords):
                    continue
                # register the account, then record the association
                weixin.initAccount(gh_id=gh_id, account_name=account_name)
                weixin.putIntoAssociationGraph(
                    gh_id=gh_id,
                    account_name=account_name,
                    source_account=source_account,
                    source_title=title,
                )
|