"""
@author: luojunhui

Fetch and download articles from WeChat accounts related to our in-house accounts.
"""

import datetime
import time

from pandas import DataFrame

from applications import PQMySQL, WeixinSpider, AlgApi, Functions
from config import accountBaseInfo
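

# Overall flow (mirrored by the __main__ block below): find in-house accounts,
# pick their best-performing articles, search WeChat for accounts publishing
# similar titles, then register those accounts and record the association.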
class weixinRelationAccountGoodArticles(object):
    """
    Discover and crawl high-quality associated accounts.
    """

    pq_mysql_client = PQMySQL()
    wx_spider = WeixinSpider()
    function = Functions()
    spider_client = PQMySQL()
    nlp = AlgApi()

    @classmethod
    def findInnerAccount(cls):
        """
        Collect the gh_ids of in-house accounts.
        :return: list of unique gh_id strings
        """
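        # accountBaseInfo keys are assumed to carry a two-character position
        # suffix, e.g. "gh_xxxx_1" -> gh_id "gh_xxxx"; [:-2] strips it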
        id_set = set()
        for key in accountBaseInfo:
            gh_id = key[:-2]
            id_set.add(gh_id)
        return list(id_set)

    @classmethod
    def initAccount(cls, gh_id, account_name):
        """
        Register a newly discovered account, one row per position 1-8.
        :param gh_id: gh_id of the account
        :param account_name: display name of the account
        :return:
        """
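        # a WeChat push can carry up to eight articles, which presumably
        # motivates creating one row per position 1-8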
        insert_sql = """
            INSERT INTO long_articles_accounts
            (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
            values
            (%s, %s, %s, %s, %s, %s, %s);
        """
        for index in range(1, 9):
            try:
                cls.pq_mysql_client.update(
                    sql=insert_sql,
                    params=(gh_id, "weixin", account_name, index, "association", 0, 1),
                )
            except Exception as e:
                print(e)
        print("account initialization finished")

    @classmethod
    def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
        """
        Record the account in the association table.
        :param gh_id: gh_id of the associated account
        :param account_name: name of the associated account
        :param source_title: title of the source article
        :param source_account: gh_id of the source account
        :return:
        """
        # the table name spelling below matches the existing schema
        insert_sql = """
            INSERT INTO long_articles_assiciation_accounts
            (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
            values
            (%s, %s, %s, %s, %s, %s);
        """
        try:
            cls.pq_mysql_client.update(
                sql=insert_sql,
                params=(
                    gh_id,
                    account_name,
                    source_title,
                    source_account,
                    str(datetime.datetime.now()),
                    1,
                ),
            )
        except Exception as e:
            print(e)

    @classmethod
    def getEachAccountArticle(cls, account_id):
        """
        Fetch all published articles of one account.
        :param account_id: gh_id of the account
        :return: DataFrame of the account's articles
        """
        # account_id comes from our own config, so inlining it is assumed safe;
        # switch to a parameterized query if it ever comes from user input
        select_sql = f"""
            SELECT title, Type, updateTime, ItemIndex, show_view_count
            FROM official_articles_v2
            WHERE ghId = '{account_id}';
        """
        result = cls.pq_mysql_client.select(select_sql)
        return DataFrame(
            result,
            columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
        )

    @classmethod
    def filterGoodArticle(cls, article_data_frame):
        """
        Keep articles whose view count beats the account average by 10%.
        :param article_data_frame: DataFrame from getEachAccountArticle
        :return: list of qualifying article titles
        """
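        # example: view counts [100, 200, 300] average to 200, so only the
        # 300-view article clears the 200 * 1.1 = 220 cutoff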
        avg_view = article_data_frame["show_view_count"].mean()
        good_articles = article_data_frame[
            article_data_frame["show_view_count"] > avg_view * 1.1
        ]
        return good_articles["title"].values.tolist()

    @classmethod
    def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
        """
        Search WeChat by article title and collect the publishing accounts.
        :param source_account: gh_id of the source account
        :param source_title: article title to search for
        :param base_score: minimum similarity score (filter currently disabled)
        :return: list of [title, account_info] pairs
        """
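        # payload shapes assumed from the key access below:
        #   search_articles(title)  -> {"data": {"data": [{"title": ..., "url": ...}, ...]}}
        #   get_account_by_url(url) -> {"data": {"data": {"account_name": ..., "wx_gh": ...}}}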
        response = cls.wx_spider.search_articles(source_title)
        article_list = response["data"]["data"]
        if article_list:
            title_list = [i["title"] for i in article_list]
            title_score_list = cls.nlp.getScoreList(
                accountName=source_account, title_list=title_list
            )[source_account]["score_list"]
            account_list = []
            for index, score in enumerate(title_score_list):
                # score filter disabled for now; re-enable to drop weak matches:
                # if base_score is not None and score <= base_score:
                #     continue
                article_obj = article_list[index]
                account_info = cls.wx_spider.get_account_by_url(
                    content_url=article_obj["url"]
                )
                obj = [article_obj["title"], account_info]
                account_list.append(obj)
            return account_list
        else:
            return []

    @classmethod
    def insertIntoDataBase(cls, gh_id, article_list):
        """
        Save the crawled articles of an associated account.
        :param gh_id: gh_id of the associated account
        :param article_list: raw article payloads from the spider
        :return:
        """
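        # each payload is assumed to look like
        #   {"AppMsg": {"DetailInfo": [{"Title": ..., "ContentUrl": ...,
        #    "ShowDesc": ..., "Digest": ..., "send_time": ...}, ...]}}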
        insert_sql = """
            insert into crawler_meta_article
            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        for article_obj in article_list:
            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
            for obj in detail_article_list:
                try:
                    # parse the "ShowDesc" field into view / like counters
                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                    show_view_count = show_stat.get("show_view_count", 0)
                    show_like_count = show_stat.get("show_like_count", 0)
                    cls.spider_client.update(
                        sql=insert_sql,
                        params=(
                            "weixin",
                            "account",
                            "association",
                            gh_id,
                            obj["Title"],
                            obj["ContentUrl"],
                            show_view_count,
                            show_like_count,
                            obj["Digest"],
                            obj["send_time"],
                            int(time.time()),
                            1,
                            cls.function.generateGzhId(obj["ContentUrl"]),
                        ),
                    )
                except Exception as e:
                    print(e)

    # unfinished draft, kept commented out (the "account" branch returns an
    # undefined name):
    # @classmethod
    # def searchResultFilter(cls, filter_type, info):
    #     """
    #     Filter search results.
    #     :param info: data to filter
    #     :param filter_type: "account" filters accounts, "article" filters articles
    #     :return: the filtered result
    #     """
    #     match filter_type:
    #         case "account":
    #             return account


if __name__ == "__main__":
    weixin = weixinRelationAccountGoodArticles()
    # collect the in-house accounts
    inner_account_list = weixin.findInnerAccount()
    # only the first account is processed for now
    for source_account in inner_account_list[:1]:
        accountArticlesDataFrame = weixin.getEachAccountArticle(
            account_id=source_account
        )
        goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
        for title in goodArticles:
            account_list = weixin.searchGoodArticlesAccounts(
                source_account=source_account, source_title=title
            )
            print(title)
            print(source_account)
            for associated_account in account_list:
                source_title = associated_account[0]
                associated_account_info = associated_account[1]
                account_name = associated_account_info["data"]["data"]["account_name"]
                gh_id = associated_account_info["data"]["data"]["wx_gh"]
                # skip news outlets: 新闻 ("news") and 央视 ("CCTV")
                if "新闻" in account_name or "央视" in account_name:
                    continue
                # register the account and record the association
                weixin.initAccount(gh_id=gh_id, account_name=account_name)
                weixin.putIntoAssociationGraph(
                    gh_id=gh_id,
                    account_name=account_name,
                    source_account=source_account,
                    source_title=title,
                )