# weixinRelativeAccountCrawler.py
  1. """
  2. @author: luojunhui
  3. 获取微信相关账号文章下载
  4. """
  5. import datetime
  6. import json
  7. import time
  8. from pandas import DataFrame
  9. from applications import PQMySQL, WeixinSpider, AlgApi, Functions
  10. from config import accountBaseInfo
  11. class weixinRelationAccountGoodArticles(object):
  12. """
  13. 优质账号抓取
  14. """
  15. pq_mysql_client = PQMySQL()
  16. wx_spider = WeixinSpider()
  17. function = Functions()
  18. spider_client = PQMySQL()
  19. nlp = AlgApi()
  20. @classmethod
  21. def findInnerAccount(cls):
  22. """
  23. 找出站内的账号
  24. :return:
  25. """
  26. id_set = set()
  27. for key in accountBaseInfo:
  28. gh_id = key[:-2]
  29. id_set.add(gh_id)
  30. return list(id_set)
  31. @classmethod
  32. def initAccount(cls, gh_id, account_name):
  33. """
  34. 初始化账号
  35. :param gh_id:
  36. :param account_name:
  37. :return:
  38. """
  39. for index in [i for i in range(1, 9)]:
  40. insert_sql = f"""
  41. INSERT INTO long_articles_accounts
  42. (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
  43. values
  44. (%s, %s, %s, %s, %s, %s, %s);
  45. """
  46. try:
  47. cls.pq_mysql_client.update(
  48. sql=insert_sql,
  49. params=(gh_id, "weixin", account_name, index, "association", 0, 1),
  50. )
  51. except Exception as e:
  52. print(e)
  53. print("账号初始化完成")
  54. @classmethod
  55. def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
  56. """
  57. 将账号加入到联想表中
  58. :param gh_id: 联想账号id
  59. :param account_name: 联想账号名称
  60. :param source_title: 源标题
  61. :param source_account: 源账号
  62. :return:
  63. """
  64. insert_sql = f"""
  65. INSERT INTO long_articles_assiciation_accounts
  66. (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
  67. values
  68. (%s, %s, %s, %s, %s, %s);
  69. """
  70. try:
  71. cls.pq_mysql_client.update(
  72. sql=insert_sql,
  73. params=(
  74. gh_id,
  75. account_name,
  76. source_title,
  77. source_account,
  78. datetime.datetime.now().__str__(),
  79. 1,
  80. ),
  81. )
  82. except Exception as e:
  83. print(e)
  84. @classmethod
  85. def getEachAccountArticle(cls, account_id):
  86. """
  87. 获取每个账号的好文章
  88. :return:
  89. """
  90. select_sql = f"""
  91. SELECT title, Type, updateTime, ItemIndex, show_view_count
  92. FROM official_articles_v2
  93. WHERE ghId = '{account_id}';
  94. """
  95. result = cls.pq_mysql_client.select(select_sql)
  96. return DataFrame(
  97. result,
  98. columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
  99. )
  100. @classmethod
  101. def filterGoodArticle(cls, article_data_frame):
  102. """
  103. 获取好的文章
  104. :param article_data_frame:
  105. :return:
  106. """
  107. avg_view = article_data_frame["show_view_count"].mean()
  108. good_articles = article_data_frame[
  109. (article_data_frame["show_view_count"]) > avg_view * 1.1
  110. ]
  111. return good_articles["title"].values.tolist()
  112. @classmethod
  113. def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
  114. """
  115. 通过标题搜索文章
  116. :return:
  117. """
  118. response = cls.wx_spider.search_articles(source_title)
  119. article_list = response["data"]["data"]
  120. if article_list:
  121. title_list = [i["title"] for i in article_list]
  122. title_score_list = cls.nlp.getScoreList(
  123. accountName=source_account, title_list=title_list
  124. )[source_account]["score_list"]
  125. account_list = []
  126. for index, score in enumerate(title_score_list):
  127. # if score > base_score:
  128. article_obj = article_list[index]
  129. account_info = cls.wx_spider.get_account_by_url(
  130. content_url=article_obj["url"]
  131. )
  132. obj = [article_obj["title"], account_info]
  133. account_list.append(obj)
  134. return account_list
  135. else:
  136. return []
  137. @classmethod
  138. def insertIntoDataBase(cls, gh_id, article_list):
  139. """
  140. 将数据插入数据库
  141. :return:
  142. """
  143. for article_obj in article_list:
  144. detail_article_list = article_obj["AppMsg"]["DetailInfo"]
  145. for obj in detail_article_list:
  146. try:
  147. show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
  148. show_view_count = show_stat.get("show_view_count", 0)
  149. show_like_count = show_stat.get("show_like_count", 0)
  150. insert_sql = f"""
  151. insert into crawler_meta_article
  152. (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
  153. VALUES
  154. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  155. """
  156. cls.spider_client.update(
  157. sql=insert_sql,
  158. params=(
  159. "weixin",
  160. "account",
  161. "association",
  162. gh_id,
  163. obj["Title"],
  164. obj["ContentUrl"],
  165. show_view_count,
  166. show_like_count,
  167. obj["Digest"],
  168. obj["send_time"],
  169. int(time.time()),
  170. 1,
  171. cls.function.generateGzhId(obj["ContentUrl"]),
  172. ),
  173. )
  174. except Exception as e:
  175. print(e)
  176. # @classmethod
  177. # def searchResultFilter(cls, filter_type, info):
  178. # """
  179. # 搜索结果过滤
  180. # :param info: 待过滤信息
  181. # :param filter_type: 过滤类型,account表示账号过滤, article表示文章过滤
  182. # :return: 过滤后的结果
  183. # """
  184. # match filter_type:
  185. # case "account":
  186. # return account
  187. if __name__ == "__main__":
  188. weixin = weixinRelationAccountGoodArticles()
  189. # 获取内部账号
  190. inner_account_list = weixin.findInnerAccount()
  191. for source_account in inner_account_list[:1]:
  192. accountArticlesDataFrame = weixin.getEachAccountArticle(
  193. account_id=source_account
  194. )
  195. goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
  196. for title in goodArticles:
  197. account_list = weixin.searchGoodArticlesAccounts(
  198. source_account=source_account, source_title=title
  199. )
  200. print(title)
  201. print(source_account)
  202. for associated_account in account_list:
  203. source_title = associated_account[0]
  204. associated_account_info = associated_account[1]
  205. account_name = associated_account_info["data"]["data"]["account_name"]
  206. gh_id = associated_account_info["data"]["data"]["wx_gh"]
  207. if '新闻' in account_name:
  208. continue
  209. elif '央视' in account_name:
  210. continue
  211. else:
  212. # 初始化账号
  213. weixin.initAccount(gh_id=gh_id, account_name=account_name)
  214. weixin.putIntoAssociationGraph(
  215. gh_id=gh_id,
  216. account_name=account_name,
  217. source_account=source_account,
  218. source_title=title
  219. )