weixin_account_association_crawler.py

  1. """
  2. @author: luojunhui
  3. 微信账号联想
  4. """
  5. import datetime
  6. import json
  7. import traceback
  8. from typing import List, Set, Dict
  9. from tqdm import tqdm
  10. from pymysql.cursors import DictCursor
  11. from applications import aiditApi
  12. from applications import bot
  13. from applications import log
  14. from applications import longArticlesMySQL
  15. from applications import WeixinSpider
  16. from applications.const import AccountAssociationTaskConst
  17. from applications.functions import Functions
  18. from config import apolloConfig
  19. const = AccountAssociationTaskConst()
  20. function = Functions()
  21. config = apolloConfig()
  22. empty_dict = {}


def get_inner_account_gh_id() -> Set[str]:
    """
    Fetch the gh_id set of internal accounts
    :return: set of gh_id strings
    """
    accounts = aiditApi.get_publish_account_from_aigc()
    gh_id_list = [i['ghId'] for i in accounts]
    return set(gh_id_list)


class AccountAssociationCrawler(object):
    """
    Account association crawler
    """

    def __init__(self):
        self.db_client = longArticlesMySQL()
        self.spider = WeixinSpider()
        # block list loaded from Apollo: a JSON array of name substrings to filter out
        self.account_name_filter = json.loads(config.getConfigValue('account_name_filter'))
        # counters reported in the final bot notification
        self.crawler_account_count = 0
        self.total_crawler_count = 0
        self.inner_account_count = 0
        self.account_name_filter_count = 0
        self.already_crawler_account_count = 0
        self.official_accounts = 0

    def is_bad_account(self, account_name: str) -> bool:
        """
        Check whether the account name is empty or hits the name block list
        :param account_name:
        :return:
        """
        if account_name == "":
            return True
        for key in self.account_name_filter:
            if key in account_name:
                return True
        return False

    def is_account_official(self, gh_id: str) -> bool:
        """
        Check whether the account is an official account
        :param gh_id:
        :return: True or False
        """
        response = self.spider.update_msg_list(ghId=gh_id, index=None)
        article_list = response['data']['data']
        published_articles_send_date = []
        for item in article_list:
            if item.get("AppMsg", empty_dict).get("BaseInfo", empty_dict).get("Type") == 9:
                # collect the send_time of each headline mass-send
                send_time = item['AppMsg']['DetailInfo'][0]['send_time']
                send_date = datetime.datetime.fromtimestamp(send_time).strftime('%Y-%m-%d')
                published_articles_send_date.append(send_date)
        # heuristic: a send date that appears more than once marks the account as official
        return len(set(published_articles_send_date)) != len(published_articles_send_date)

    def get_seed_titles(self, run_date: datetime.datetime) -> List[Dict]:
        """
        Fetch high-performing seed articles published within the last STAT_PERIOD
        :param run_date:
        :return:
        """
        publish_timestamp_threshold = int(run_date.timestamp()) - const.STAT_PERIOD
        sql = f"""
            SELECT DISTINCT t1.account_name, t1.title, t2.kimi_summary, t2.kimi_keys
            FROM datastat_sort_strategy t1
            JOIN long_articles_text t2
                ON t1.source_id = t2.content_id
            WHERE t1.read_rate > {const.READ_AVG_MULTIPLE}
                AND t1.view_count > {const.MIN_READ_COUNT}
                AND publish_timestamp > {publish_timestamp_threshold}
            ORDER BY read_rate DESC
            LIMIT {const.SEED_TITLE_LIMIT};
        """
        article_obj_list = self.db_client.select(sql, cursor_type=DictCursor)
        return article_obj_list

    def search_account_in_weixin(self, article_obj: Dict) -> Dict:
        """
        Search accounts through the article search API, using the original title,
        the kimi summary and the kimi keywords as three separate queries
        :param article_obj:
        :return:
        """
        ori_title = article_obj['title']
        summary = article_obj['kimi_summary']
        kimi_keys = json.loads(article_obj['kimi_keys']) if article_obj['kimi_keys'] else None
        response_1 = self.spider.search_articles(title=ori_title)
        response_2 = self.spider.search_articles(title=summary) if summary else {}
        response_3 = self.spider.search_articles(title=", ".join(kimi_keys)) if kimi_keys else {}
        response = {
            "title": response_1,
            "summary": response_2,
            "kimi_keys": response_3
        }
        return response

    def insert_account_into_database(self, account_name: str, gh_id: str, category: str, biz_date: str) -> int:
        """
        :param account_name:
        :param gh_id:
        :param category:
        :param biz_date:
        :return:
        """
        insert_sql = """
            INSERT INTO long_articles_accounts
                (gh_id, account_source, account_name, account_category, init_date)
            VALUES
                (%s, %s, %s, %s, %s)
        """
        affected_rows = self.db_client.update(
            sql=insert_sql,
            params=(gh_id, "weixin", account_name, category, biz_date)
        )
        return affected_rows

    def save_account_into_db(self, search_response: Dict, inner_account_gh_id_set: Set, biz_date: str) -> None:
        """
        Resolve the searched articles to accounts, filter them and save the rest
        :param search_response:
        :param inner_account_gh_id_set:
        :param biz_date:
        :return:
        """
        for key in search_response:
            value = search_response[key]
            if not value:
                continue
            search_article_list = value['data']['data']
            for article in tqdm(search_article_list):
                article_url = article['url']
                try:
                    account_info = self.spider.get_account_by_url(article_url)
                    self.total_crawler_count += 1
                    account_name = account_info['data']['data']['account_name']
                    gh_id = account_info['data']['data']['wx_gh']
                    # skip internal accounts
                    if gh_id in inner_account_gh_id_set:
                        self.inner_account_count += 1
                        continue
                    # skip bad or dangerous accounts by name
                    if self.is_bad_account(account_name):
                        self.account_name_filter_count += 1
                        continue
                    # skip official accounts
                    if self.is_account_official(gh_id):
                        self.official_accounts += 1
                        continue
                    try:
                        self.insert_account_into_database(
                            account_name=account_name,
                            gh_id=gh_id,
                            category="account_association",
                            biz_date=biz_date
                        )
                    except Exception as e:
                        # insert failures are counted as already-crawled accounts
                        self.already_crawler_account_count += 1
                        print(e)
                        continue
                    self.crawler_account_count += 1
                except Exception as e:
                    log(
                        task="account_association",
                        function="save_account_into_db",
                        data={
                            "biz_date": biz_date,
                            "article": article,
                            "trace_back": traceback.format_exc(),
                            "error": f"{e}"
                        }
                    )
                    continue

    def run_account_association(self, biz_date: datetime.datetime):
        """
        Run the account association task
        :param biz_date:
        :return:
        """
        inner_account_gh_id_set = get_inner_account_gh_id()
        seed_articles = self.get_seed_titles(biz_date)
        for article in tqdm(seed_articles):
            try:
                # search from weixin
                search_response = self.search_account_in_weixin(article)
                # save the associated accounts
                self.save_account_into_db(
                    search_response=search_response,
                    inner_account_gh_id_set=inner_account_gh_id_set,
                    biz_date=biz_date.strftime("%Y-%m-%d")
                )
            except Exception as e:
                log(
                    task="account_association",
                    function="run_account_association",
                    data={
                        "biz_date": biz_date.strftime("%Y-%m-%d"),
                        "article": article,
                        "trace_back": traceback.format_exc(),
                        "error": f"{e}"
                    }
                )
        bot(
            title="account association - account crawling finished",
            detail={
                "total associated accounts": self.total_crawler_count,
                "filtered internal accounts": self.inner_account_count,
                "filtered by account name": self.account_name_filter_count,
                "filtered official accounts": self.official_accounts,
                "already crawled accounts": self.already_crawler_account_count,
                "new accounts": self.crawler_account_count
            }
        )
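

# A minimal entry-point sketch, not part of the original file: it assumes the
# task runs once per day with the current date as biz_date; hook it up to your
# own scheduler as needed.
if __name__ == "__main__":
    crawler = AccountAssociationCrawler()
    crawler.run_account_association(biz_date=datetime.datetime.today())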