|  | @@ -0,0 +1,214 @@
 | 
											
												
													
														|  | 
 |  | +"""
 | 
											
												
													
														|  | 
 |  | +@author: luojunhui
 | 
											
												
													
														|  | 
 |  | +微信账号联想
 | 
											
												
													
														|  | 
 |  | +"""
 | 
											
												
													
														|  | 
 |  | +import datetime
 | 
											
												
													
														|  | 
 |  | +import json
 | 
											
												
													
														|  | 
 |  | +import traceback
 | 
											
												
													
														|  | 
 |  | +from typing import List, Set, Dict
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +from tqdm import tqdm
 | 
											
												
													
														|  | 
 |  | +from pymysql.cursors import DictCursor
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +from applications import WeixinSpider, longArticlesMySQL, log, bot, aiditApi
 | 
											
												
													
														|  | 
 |  | +from applications.const import WeixinVideoCrawlerConst
 | 
											
												
													
														|  | 
 |  | +from applications.functions import Functions
 | 
											
												
													
														|  | 
 |  | +from config import apolloConfig
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +const = WeixinVideoCrawlerConst()
 | 
											
												
													
														|  | 
 |  | +function = Functions()
 | 
											
												
													
														|  | 
 |  | +config = apolloConfig()
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def get_inner_account_gh_id() -> Set[str]:
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +    获取内部账号名称
 | 
											
												
													
														|  | 
 |  | +    :return:
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +    accounts = aiditApi.get_publish_account_from_aigc()
 | 
											
												
													
														|  | 
 |  | +    gh_id_list = [i['ghId'] for i in accounts]
 | 
											
												
													
														|  | 
 |  | +    return set(gh_id_list)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +class AccountAssociationCrawler(object):
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +    账号抓取
 | 
											
												
													
														|  | 
 |  | +    """
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def __init__(self):
 | 
											
												
													
														|  | 
 |  | +        self.db_client = longArticlesMySQL()
 | 
											
												
													
														|  | 
 |  | +        self.spider = WeixinSpider()
 | 
											
												
													
														|  | 
 |  | +        self.account_name_filter = json.loads(config.getConfigValue('account_name_filter'))
 | 
											
												
													
														|  | 
 |  | +        self.crawler_account_count = 0
 | 
											
												
													
														|  | 
 |  | +        self.total_crawler_count = 0
 | 
											
												
													
														|  | 
 |  | +        self.inner_account_count = 0
 | 
											
												
													
														|  | 
 |  | +        self.account_name_filter_count = 0
 | 
											
												
													
														|  | 
 |  | +        self.already_crawler_account_count = 0
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def is_bad_account(self, account_name: str) -> bool:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        判断账号是否为bad account
 | 
											
												
													
														|  | 
 |  | +        :param account_name:
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        if account_name == "":
 | 
											
												
													
														|  | 
 |  | +            return True
 | 
											
												
													
														|  | 
 |  | +        for key in self.account_name_filter:
 | 
											
												
													
														|  | 
 |  | +            if key in account_name:
 | 
											
												
													
														|  | 
 |  | +                return True
 | 
											
												
													
														|  | 
 |  | +        return False
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def get_seed_titles(self, run_date: datetime) -> List[Dict]:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        publish_timestamp_threshold = int(run_date.timestamp()) - const.STAT_PERIOD
 | 
											
												
													
														|  | 
 |  | +        sql = f"""
 | 
											
												
													
														|  | 
 |  | +            SELECT DISTINCT t1.account_name, t1.title, t2.kimi_summary, t2.kimi_keys
 | 
											
												
													
														|  | 
 |  | +            FROM datastat_sort_strategy t1
 | 
											
												
													
														|  | 
 |  | +            JOIN long_articles_text t2
 | 
											
												
													
														|  | 
 |  | +            ON t1.source_id = t2.content_id
 | 
											
												
													
														|  | 
 |  | +            WHERE t1.read_rate > {const.READ_AVG_MULTIPLE} 
 | 
											
												
													
														|  | 
 |  | +                AND t1.view_count > {const.MIN_READ_COUNT} 
 | 
											
												
													
														|  | 
 |  | +                AND publish_timestamp > {publish_timestamp_threshold}
 | 
											
												
													
														|  | 
 |  | +            ORDER BY read_rate DESC
 | 
											
												
													
														|  | 
 |  | +            LIMIT 100;
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        article_obj_list = self.db_client.select(sql, cursor_type=DictCursor)
 | 
											
												
													
														|  | 
 |  | +        return article_obj_list
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def search_account_in_weixin(self, article_obj: Dict) -> Dict:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        通过文章信息使用搜索接口搜索账号
 | 
											
												
													
														|  | 
 |  | +        :param article_obj:
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        ori_title = article_obj['title']
 | 
											
												
													
														|  | 
 |  | +        summary = article_obj['kimi_summary']
 | 
											
												
													
														|  | 
 |  | +        kimi_keys = json.loads(article_obj['kimi_keys']) if article_obj['kimi_keys'] else None
 | 
											
												
													
														|  | 
 |  | +        response_1 = self.spider.search_articles(title=ori_title)
 | 
											
												
													
														|  | 
 |  | +        response_2 = self.spider.search_articles(title=summary) if summary else {}
 | 
											
												
													
														|  | 
 |  | +        response_3 = self.spider.search_articles(title=", ".join(kimi_keys)) if kimi_keys else {}
 | 
											
												
													
														|  | 
 |  | +        response = {
 | 
											
												
													
														|  | 
 |  | +            "title": response_1,
 | 
											
												
													
														|  | 
 |  | +            "summary": response_2,
 | 
											
												
													
														|  | 
 |  | +            "kimi_keys": response_3
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        return response
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def insert_account_into_database(self, account_name: str, gh_id: str, category: str, biz_date: str) -> int:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        :param biz_date:
 | 
											
												
													
														|  | 
 |  | +        :param category:
 | 
											
												
													
														|  | 
 |  | +        :param account_name:
 | 
											
												
													
														|  | 
 |  | +        :param gh_id:
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        insert_sql = f"""
 | 
											
												
													
														|  | 
 |  | +            INSERT INTO long_articles_accounts
 | 
											
												
													
														|  | 
 |  | +            (gh_id, account_source, account_name, account_category, init_date)
 | 
											
												
													
														|  | 
 |  | +            values 
 | 
											
												
													
														|  | 
 |  | +            (%s, %s, %s, %s, %s)
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        affected_rows = self.db_client.update(
 | 
											
												
													
														|  | 
 |  | +            sql=insert_sql,
 | 
											
												
													
														|  | 
 |  | +            params=(gh_id, "weixin", account_name, category, biz_date)
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        return affected_rows
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def save_account_into_db(self, search_response: Dict, inner_account_gh_id_set: Set, biz_date: str) -> None:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        保存账号信息
 | 
											
												
													
														|  | 
 |  | +        :param biz_date:
 | 
											
												
													
														|  | 
 |  | +        :param search_response:
 | 
											
												
													
														|  | 
 |  | +        :param inner_account_gh_id_set:
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        for key in search_response:
 | 
											
												
													
														|  | 
 |  | +            value = search_response[key]
 | 
											
												
													
														|  | 
 |  | +            if value:
 | 
											
												
													
														|  | 
 |  | +                search_article_list = value['data']['data']
 | 
											
												
													
														|  | 
 |  | +                for article in tqdm(search_article_list):
 | 
											
												
													
														|  | 
 |  | +                    article_url = article['url']
 | 
											
												
													
														|  | 
 |  | +                    try:
 | 
											
												
													
														|  | 
 |  | +                        account_info = self.spider.get_account_by_url(article_url)
 | 
											
												
													
														|  | 
 |  | +                        self.total_crawler_count += 1
 | 
											
												
													
														|  | 
 |  | +                        account_name = account_info['data']['data']['account_name']
 | 
											
												
													
														|  | 
 |  | +                        gh_id = account_info['data']['data']['wx_gh']
 | 
											
												
													
														|  | 
 |  | +                        # 过滤内部账号
 | 
											
												
													
														|  | 
 |  | +                        if gh_id in inner_account_gh_id_set:
 | 
											
												
													
														|  | 
 |  | +                            self.inner_account_count += 1
 | 
											
												
													
														|  | 
 |  | +                            continue
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +                        # 通过账号名称过滤一些bad_account or dangerous account
 | 
											
												
													
														|  | 
 |  | +                        if self.is_bad_account(account_name):
 | 
											
												
													
														|  | 
 |  | +                            self.account_name_filter_count += 1
 | 
											
												
													
														|  | 
 |  | +                            continue
 | 
											
												
													
														|  | 
 |  | +                        try:
 | 
											
												
													
														|  | 
 |  | +                            self.insert_account_into_database(
 | 
											
												
													
														|  | 
 |  | +                                account_name=account_name,
 | 
											
												
													
														|  | 
 |  | +                                gh_id=gh_id,
 | 
											
												
													
														|  | 
 |  | +                                category="account_association",
 | 
											
												
													
														|  | 
 |  | +                                biz_date=biz_date
 | 
											
												
													
														|  | 
 |  | +                            )
 | 
											
												
													
														|  | 
 |  | +                        except Exception as e:
 | 
											
												
													
														|  | 
 |  | +                            self.already_crawler_account_count += 1
 | 
											
												
													
														|  | 
 |  | +                            print(e)
 | 
											
												
													
														|  | 
 |  | +                            continue
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +                        self.crawler_account_count += 1
 | 
											
												
													
														|  | 
 |  | +                    except Exception as e:
 | 
											
												
													
														|  | 
 |  | +                        log(
 | 
											
												
													
														|  | 
 |  | +                            task="account_association",
 | 
											
												
													
														|  | 
 |  | +                            function="save_account_into_db",
 | 
											
												
													
														|  | 
 |  | +                            data={
 | 
											
												
													
														|  | 
 |  | +                                "biz_date": biz_date,
 | 
											
												
													
														|  | 
 |  | +                                "article": article,
 | 
											
												
													
														|  | 
 |  | +                                "trace_back": traceback.format_exc(),
 | 
											
												
													
														|  | 
 |  | +                                "error": f"{e}"
 | 
											
												
													
														|  | 
 |  | +                            }
 | 
											
												
													
														|  | 
 |  | +                        )
 | 
											
												
													
														|  | 
 |  | +                        continue
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                continue
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def run_account_association(self, biz_date: datetime):
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        执行账号联想
 | 
											
												
													
														|  | 
 |  | +        :param biz_date:
 | 
											
												
													
														|  | 
 |  | +        :return:
 | 
											
												
													
														|  | 
 |  | +        """
 | 
											
												
													
														|  | 
 |  | +        inner_account_gh_id_set = get_inner_account_gh_id()
 | 
											
												
													
														|  | 
 |  | +        seed_articles = self.get_seed_titles(biz_date)
 | 
											
												
													
														|  | 
 |  | +        for article in tqdm(seed_articles):
 | 
											
												
													
														|  | 
 |  | +            try:
 | 
											
												
													
														|  | 
 |  | +                # search from weixin
 | 
											
												
													
														|  | 
 |  | +                search_response = self.search_account_in_weixin(article)
 | 
											
												
													
														|  | 
 |  | +                # save
 | 
											
												
													
														|  | 
 |  | +                self.save_account_into_db(
 | 
											
												
													
														|  | 
 |  | +                    search_response=search_response,
 | 
											
												
													
														|  | 
 |  | +                    inner_account_gh_id_set=inner_account_gh_id_set,
 | 
											
												
													
														|  | 
 |  | +                    biz_date=biz_date.strftime("%Y-%m-%d")
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +            except Exception as e:
 | 
											
												
													
														|  | 
 |  | +                log(
 | 
											
												
													
														|  | 
 |  | +                    task="account_association",
 | 
											
												
													
														|  | 
 |  | +                    function="run_account_association",
 | 
											
												
													
														|  | 
 |  | +                    data={
 | 
											
												
													
														|  | 
 |  | +                        "biz_date": biz_date,
 | 
											
												
													
														|  | 
 |  | +                        "article": article,
 | 
											
												
													
														|  | 
 |  | +                        "trace_back": traceback.format_exc(),
 | 
											
												
													
														|  | 
 |  | +                        "error": f"{e}"
 | 
											
												
													
														|  | 
 |  | +                    }
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +        bot(
 | 
											
												
													
														|  | 
 |  | +            title="账号联想-账号抓取完成",
 | 
											
												
													
														|  | 
 |  | +            detail={
 | 
											
												
													
														|  | 
 |  | +                "总共联想到的账号数": self.total_crawler_count,
 | 
											
												
													
														|  | 
 |  | +                "内部账号过滤": self.inner_account_count,
 | 
											
												
													
														|  | 
 |  | +                "账号名称过滤": self.account_name_filter_count,
 | 
											
												
													
														|  | 
 |  | +                "已经抓取账号": self.already_crawler_account_count,
 | 
											
												
													
														|  | 
 |  | +                "新增账号": self.crawler_account_count
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +
 |