luojunhui 4 months ago
parent
commit
aa893df7fd

+ 0 - 52
coldStartTasks/crawler/weixinAssociationCrawler.py

@@ -1,52 +0,0 @@
-"""
-@author: luojunhui
-WeChat association crawling
-"""
-import json
-
-from tqdm import tqdm
-
-from applications import PQMySQL
-from applications.spiderTool import SpiderTools
-
-
-class weixinAssociation(object):
-    """
-    WeChat association crawler methods
-    """
-    pq_mysql_client = PQMySQL()
-    spider_tool = SpiderTools()
-
-    @classmethod
-    def getAssociationAccounts(cls):
-        """
-        Fetch accounts that have already been associated
-        :return:
-        """
-        select_sql = f"""
-        SELECT distinct(gh_id)
-        FROM long_articles_accounts
-        where is_using = 1 and account_category = 'association';"""
-        account_id_tuple = cls.pq_mysql_client.select(select_sql)
-        account_id_list = [list(i) for i in account_id_tuple]
-        return account_id_list
-
-    @classmethod
-    def deal(cls):
-        """
-        main function
-        :return:
-        """
-        account_info_list = cls.getAssociationAccounts()
-        for line in tqdm(account_info_list[1:]):
-            gh_id = line[0]
-            cls.spider_tool.searchEachAccountArticlesSinglePage(
-                gh_id=gh_id,
-                category="association"
-            )
-
-
-
-
-w = weixinAssociation()
-w.deal()

+ 0 - 235
coldStartTasks/crawler/weixinRelativeAccountCrawler.py

@@ -1,235 +0,0 @@
-"""
-@author: luojunhui
-Fetch and download articles from related WeChat accounts
-"""
-
-import datetime
-import json
-import time
-
-from pandas import DataFrame
-
-from applications import PQMySQL, WeixinSpider, AlgApi, Functions
-from config import accountBaseInfo
-
-
-class weixinRelationAccountGoodArticles(object):
-    """
-    Crawl high-quality accounts
-    """
-
-    pq_mysql_client = PQMySQL()
-    wx_spider = WeixinSpider()
-    function = Functions()
-    spider_client = PQMySQL()
-    nlp = AlgApi()
-
-    @classmethod
-    def findInnerAccount(cls):
-        """
-        Find internal (in-site) accounts
-        :return:
-        """
-        id_set = set()
-        for key in accountBaseInfo:
-            gh_id = key[:-2]
-            id_set.add(gh_id)
-        return list(id_set)
-
-    @classmethod
-    def initAccount(cls, gh_id, account_name):
-        """
-        Initialize an account
-        :param gh_id:
-        :param account_name:
-        :return:
-        """
-        for index in range(1, 9):
-            insert_sql = f"""
-                INSERT INTO long_articles_accounts
-                (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
-                values 
-                (%s, %s, %s, %s, %s, %s, %s);
-            """
-            try:
-                cls.pq_mysql_client.update(
-                    sql=insert_sql,
-                    params=(gh_id, "weixin", account_name, index, "association", 0, 1),
-                )
-            except Exception as e:
-                print(e)
-        print("账号初始化完成")
-
-    @classmethod
-    def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
-        """
-        Add the account to the association table
-        :param gh_id: associated account gh_id
-        :param account_name: associated account name
-        :param source_title: source article title
-        :param source_account: source account
-        :return:
-        """
-        insert_sql = f"""
-            INSERT INTO long_articles_assiciation_accounts
-            (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
-            values 
-            (%s, %s, %s, %s, %s, %s);
-        """
-        try:
-            cls.pq_mysql_client.update(
-                sql=insert_sql,
-                params=(
-                    gh_id,
-                    account_name,
-                    source_title,
-                    source_account,
-                    str(datetime.datetime.now()),
-                    1,
-                ),
-            )
-        except Exception as e:
-            print(e)
-
-    @classmethod
-    def getEachAccountArticle(cls, account_id):
-        """
-        Fetch the articles of a single account
-        :return:
-        """
-        select_sql = f"""
-                SELECT title, Type, updateTime, ItemIndex, show_view_count
-                FROM official_articles_v2
-                WHERE ghId = '{account_id}';
-            """
-        result = cls.pq_mysql_client.select(select_sql)
-        return DataFrame(
-            result,
-            columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
-        )
-
-    @classmethod
-    def filterGoodArticle(cls, article_data_frame):
-        """
-        Keep only the good articles (above-average view count)
-        :param article_data_frame:
-        :return:
-        """
-        avg_view = article_data_frame["show_view_count"].mean()
-        good_articles = article_data_frame[
-            (article_data_frame["show_view_count"]) > avg_view * 1.1
-        ]
-        return good_articles["title"].values.tolist()
-
-    @classmethod
-    def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
-        """
-        Search for articles by title
-        :return:
-        """
-        response = cls.wx_spider.search_articles(source_title)
-        article_list = response["data"]["data"]
-        if article_list:
-            title_list = [i["title"] for i in article_list]
-            title_score_list = cls.nlp.getScoreList(
-                accountName=source_account, title_list=title_list
-            )[source_account]["score_list"]
-            account_list = []
-            for index, score in enumerate(title_score_list):
-                # if score > base_score:
-                article_obj = article_list[index]
-                account_info = cls.wx_spider.get_account_by_url(
-                    content_url=article_obj["url"]
-                )
-                obj = [article_obj["title"], account_info]
-                account_list.append(obj)
-            return account_list
-        else:
-            return []
-
-    @classmethod
-    def insertIntoDataBase(cls, gh_id, article_list):
-        """
-        Insert the data into the database
-        :return:
-        """
-        for article_obj in article_list:
-            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
-            for obj in detail_article_list:
-                try:
-                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
-                    show_view_count = show_stat.get("show_view_count", 0)
-                    show_like_count = show_stat.get("show_like_count", 0)
-                    insert_sql = f"""
-                                insert into crawler_meta_article
-                                (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
-                                VALUES 
-                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                            """
-                    cls.spider_client.update(
-                        sql=insert_sql,
-                        params=(
-                            "weixin",
-                            "account",
-                            "association",
-                            gh_id,
-                            obj["Title"],
-                            obj["ContentUrl"],
-                            show_view_count,
-                            show_like_count,
-                            obj["Digest"],
-                            obj["send_time"],
-                            int(time.time()),
-                            1,
-                            cls.function.generateGzhId(obj["ContentUrl"]),
-                        ),
-                    )
-                except Exception as e:
-                    print(e)
-
-    # @classmethod
-    # def searchResultFilter(cls, filter_type, info):
-    #     """
-    #     Filter search results
-    #     :param info: information to be filtered
-    #     :param filter_type: filter type; "account" filters accounts, "article" filters articles
-    #     :return: filtered results
-    #     """
-    #     match filter_type:
-    #         case "account":
-    #             return account
-
-
-if __name__ == "__main__":
-    weixin = weixinRelationAccountGoodArticles()
-    # fetch internal accounts
-    inner_account_list = weixin.findInnerAccount()
-    for source_account in inner_account_list[:1]:
-        accountArticlesDataFrame = weixin.getEachAccountArticle(
-            account_id=source_account
-        )
-        goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
-        for title in goodArticles:
-            account_list = weixin.searchGoodArticlesAccounts(
-                source_account=source_account, source_title=title
-            )
-            print(title)
-            print(source_account)
-            for associated_account in account_list:
-                source_title = associated_account[0]
-                associated_account_info = associated_account[1]
-                account_name = associated_account_info["data"]["data"]["account_name"]
-                gh_id = associated_account_info["data"]["data"]["wx_gh"]
-                if '新闻' in account_name:
-                    continue
-                elif '央视' in account_name:
-                    continue
-                else:
-                    # initialize the account
-                    weixin.initAccount(gh_id=gh_id, account_name=account_name)
-                    weixin.putIntoAssociationGraph(
-                        gh_id=gh_id,
-                        account_name=account_name,
-                        source_account=source_account,
-                        source_title=title
-                    )