Parcourir la source

Merge branch '2024-11-28-luojunhui-i2i' of luojunhui/LongArticlesJob into master

luojunhui il y a 11 mois
Parent
commit
0e646d38b6

+ 0 - 52
coldStartTasks/crawler/weixinAssociationCrawler.py

@@ -1,52 +0,0 @@
-"""
-@author: luojunhui
-微信联想抓取
-"""
-import json
-
-from tqdm import tqdm
-
-from applications import PQMySQL
-from applications.spiderTool import SpiderTools
-
-
-class weixinAssociation(object):
-    """
-    微信联想方法
-    """
-    pq_mysql_client = PQMySQL()
-    spider_tool = SpiderTools()
-
-    @classmethod
-    def getAssociationAccounts(cls):
-        """
-        获取已经联想过的账号
-        :return:
-        """
-        select_sql = f"""
-        SELECT distinct(gh_id)
-        FROM long_articles_accounts
-        where is_using = 1 and account_category = 'association';"""
-        account_id_tuple = cls.pq_mysql_client.select(select_sql)
-        account_id_list = [list(i) for i in account_id_tuple]
-        return account_id_list
-
-    @classmethod
-    def deal(cls):
-        """
-        main function
-        :return:
-        """
-        account_info_list = cls.getAssociationAccounts()
-        for line in tqdm(account_info_list[1:]):
-            gh_id = line[0]
-            cls.spider_tool.searchEachAccountArticlesSinglePage(
-                gh_id=gh_id,
-                category="association"
-            )
-
-
-
-
-w = weixinAssociation()
-w.deal()

+ 0 - 235
coldStartTasks/crawler/weixinRelativeAccountCrawler.py

@@ -1,235 +0,0 @@
-"""
-@author: luojunhui
-获取微信相关账号文章下载
-"""
-
-import datetime
-import json
-import time
-
-from pandas import DataFrame
-
-from applications import PQMySQL, WeixinSpider, AlgApi, Functions
-from config import accountBaseInfo
-
-
-class weixinRelationAccountGoodArticles(object):
-    """
-    优质账号抓取
-    """
-
-    pq_mysql_client = PQMySQL()
-    wx_spider = WeixinSpider()
-    function = Functions()
-    spider_client = PQMySQL()
-    nlp = AlgApi()
-
-    @classmethod
-    def findInnerAccount(cls):
-        """
-        找出站内的账号
-        :return:
-        """
-        id_set = set()
-        for key in accountBaseInfo:
-            gh_id = key[:-2]
-            id_set.add(gh_id)
-        return list(id_set)
-
-    @classmethod
-    def initAccount(cls, gh_id, account_name):
-        """
-        初始化账号
-        :param gh_id:
-        :param account_name:
-        :return:
-        """
-        for index in [i for i in range(1, 9)]:
-            insert_sql = f"""
-                INSERT INTO long_articles_accounts
-                (gh_id, account_source, account_name, account_position, account_category, whether_inner_account, is_using)
-                values 
-                (%s, %s, %s, %s, %s, %s, %s);
-            """
-            try:
-                cls.pq_mysql_client.update(
-                    sql=insert_sql,
-                    params=(gh_id, "weixin", account_name, index, "association", 0, 1),
-                )
-            except Exception as e:
-                print(e)
-        print("账号初始化完成")
-
-    @classmethod
-    def putIntoAssociationGraph(cls, gh_id, account_name, source_title, source_account):
-        """
-        将账号加入到联想表中
-        :param gh_id: 联想账号id
-        :param account_name: 联想账号名称
-        :param source_title: 源标题
-        :param source_account: 源账号
-        :return:
-        """
-        insert_sql = f"""
-            INSERT INTO long_articles_assiciation_accounts
-            (account_outside_id, account_name, source_article_title, source_account, association_time, is_using)
-            values 
-            (%s, %s, %s, %s, %s, %s);
-        """
-        try:
-            cls.pq_mysql_client.update(
-                sql=insert_sql,
-                params=(
-                    gh_id,
-                    account_name,
-                    source_title,
-                    source_account,
-                    datetime.datetime.now().__str__(),
-                    1,
-                ),
-            )
-        except Exception as e:
-            print(e)
-
-    @classmethod
-    def getEachAccountArticle(cls, account_id):
-        """
-        获取每个账号的好文章
-        :return:
-        """
-        select_sql = f"""
-                SELECT title, Type, updateTime, ItemIndex, show_view_count
-                FROM official_articles_v2
-                WHERE ghId = '{account_id}';
-            """
-        result = cls.pq_mysql_client.select(select_sql)
-        return DataFrame(
-            result,
-            columns=["title", "Type", "updateTime", "ItemIndex", "show_view_count"],
-        )
-
-    @classmethod
-    def filterGoodArticle(cls, article_data_frame):
-        """
-        获取好的文章
-        :param article_data_frame:
-        :return:
-        """
-        avg_view = article_data_frame["show_view_count"].mean()
-        good_articles = article_data_frame[
-            (article_data_frame["show_view_count"]) > avg_view * 1.1
-        ]
-        return good_articles["title"].values.tolist()
-
-    @classmethod
-    def searchGoodArticlesAccounts(cls, source_account, source_title, base_score=None):
-        """
-        通过标题搜索文章
-        :return:
-        """
-        response = cls.wx_spider.search_articles(source_title)
-        article_list = response["data"]["data"]
-        if article_list:
-            title_list = [i["title"] for i in article_list]
-            title_score_list = cls.nlp.getScoreList(
-                accountName=source_account, title_list=title_list
-            )[source_account]["score_list"]
-            account_list = []
-            for index, score in enumerate(title_score_list):
-                # if score > base_score:
-                article_obj = article_list[index]
-                account_info = cls.wx_spider.get_account_by_url(
-                    content_url=article_obj["url"]
-                )
-                obj = [article_obj["title"], account_info]
-                account_list.append(obj)
-            return account_list
-        else:
-            return []
-
-    @classmethod
-    def insertIntoDataBase(cls, gh_id, article_list):
-        """
-        将数据插入数据库
-        :return:
-        """
-        for article_obj in article_list:
-            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
-            for obj in detail_article_list:
-                try:
-                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
-                    show_view_count = show_stat.get("show_view_count", 0)
-                    show_like_count = show_stat.get("show_like_count", 0)
-                    insert_sql = f"""
-                                insert into crawler_meta_article
-                                (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
-                                VALUES 
-                                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                            """
-                    cls.spider_client.update(
-                        sql=insert_sql,
-                        params=(
-                            "weixin",
-                            "account",
-                            "association",
-                            gh_id,
-                            obj["Title"],
-                            obj["ContentUrl"],
-                            show_view_count,
-                            show_like_count,
-                            obj["Digest"],
-                            obj["send_time"],
-                            int(time.time()),
-                            1,
-                            cls.function.generateGzhId(obj["ContentUrl"]),
-                        ),
-                    )
-                except Exception as e:
-                    print(e)
-
-    # @classmethod
-    # def searchResultFilter(cls, filter_type, info):
-    #     """
-    #     搜索结果过滤
-    #     :param info: 待过滤信息
-    #     :param filter_type: 过滤类型,account表示账号过滤, article表示文章过滤
-    #     :return: 过滤后的结果
-    #     """
-    #     match filter_type:
-    #         case "account":
-    #             return account
-
-
-if __name__ == "__main__":
-    weixin = weixinRelationAccountGoodArticles()
-    # 获取内部账号
-    inner_account_list = weixin.findInnerAccount()
-    for source_account in inner_account_list[:1]:
-        accountArticlesDataFrame = weixin.getEachAccountArticle(
-            account_id=source_account
-        )
-        goodArticles = weixin.filterGoodArticle(accountArticlesDataFrame)
-        for title in goodArticles:
-            account_list = weixin.searchGoodArticlesAccounts(
-                source_account=source_account, source_title=title
-            )
-            print(title)
-            print(source_account)
-            for associated_account in account_list:
-                source_title = associated_account[0]
-                associated_account_info = associated_account[1]
-                account_name = associated_account_info["data"]["data"]["account_name"]
-                gh_id = associated_account_info["data"]["data"]["wx_gh"]
-                if '新闻' in account_name:
-                    continue
-                elif '央视' in account_name:
-                    continue
-                else:
-                    # 初始化账号
-                    weixin.initAccount(gh_id=gh_id, account_name=account_name)
-                    weixin.putIntoAssociationGraph(
-                        gh_id=gh_id,
-                        account_name=account_name,
-                        source_account=source_account,
-                        source_title=title
-                    )