Browse files

Merge branch '2024-11-01-luojunhui-article-exit-v1' of luojunhui/LongArticlesJob into master

fengzhoutian 7 months ago
parent
commit
b518dd5772
6 changed files with 189 additions and 246 deletions
  1. flow_pool/__init__.py (+3, -0)
  2. flow_pool/exit_article_with_title.py (+186, -0)
  3. flow_pool/upLevel.py (+0, -0, renamed from stratrgy/upLevel.py)
  4. stratrgy/__init__.py (+0, -4)
  5. stratrgy/distribution.py (+0, -140)
  6. stratrgy/strategy.py (+0, -102)

+ 3 - 0
flow_pool/__init__.py

@@ -0,0 +1,3 @@
+"""
+@author: luojunhui
+"""

+ 186 - 0
flow_pool/exit_article_with_title.py

@@ -0,0 +1,186 @@
+"""
+@author: luojunhui
+"""
+import traceback
+
+import pandas as pd
+
+from applications import PQMySQL, longArticlesMySQL, bot, log
+from applications.aiditApi import get_generated_article_list
+
+
+def get_level_up_articles() -> set:
+    """
+    :return: set of titles of generated articles pulled from the promotion pools
+    """
+    generate_pool_ids = [
+        "20240804003153130851174",
+        "20240802171417146947657",
+        "20240802143345289374071",
+    ]
+    good_title_set = set()
+    for pool_id in generate_pool_ids:
+        articles = get_generated_article_list(pool_id)
+        titles = [article[1] for article in articles]
+        good_title_set.update(titles)
+    return good_title_set
+
+
+class ArticleTitleStatusManager(object):
+    """
+    Maintains the cold-start title status table (promotion / exit)
+    """
+
+    def __init__(self):
+        self.INIT_STATUS = 0
+        self.pq_client = None
+        self.lam_client = None
+
+    def init_database(self) -> bool:
+        """
+        Initialize the database connections
+        :return: True if both connections succeed, otherwise False
+        """
+        try:
+            self.pq_client = PQMySQL()
+        except Exception as e:
+            bot(
+                title="文章退场管理任务,数据库连接失败",
+                detail={
+                    "e": str(e),
+                    "error_msg": traceback.format_exc(),
+                    "server": "old server"
+                }
+            )
+            return False
+
+        try:
+            self.lam_client = longArticlesMySQL()
+        except Exception as e:
+            bot(
+                title="文章退场管理任务,数据库连接失败",
+                detail={
+                    "e": str(e),
+                    "error_msg": traceback.format_exc(),
+                    "server": "new server"
+                }
+            )
+            return False
+        return True
+
+    def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
+        """
+        Find titles of consistently poor-performing articles so they can be marked as exited
+        :return: list of titles to exit
+        """
+        sql = f"""
+            SELECT
+                title, max(read_rate) as max_rate, count(1) as title_count
+            FROM
+                datastat_sort_strategy
+            WHERE position > 2 and fans > 10000
+            GROUP BY title
+            HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
+        """
+        articles = self.lam_client.select(sql)
+        return [i[0] for i in articles]
+
+    def save_titles(self, title_list, status) -> int:
+        """
+        Insert titles with the given status; if a title already exists, update its status instead
+        :param status: target status value
+        :param title_list: iterable of titles
+        :return: number of newly inserted titles, or -1 if any title failed both insert and update
+        """
+        fail_list = []
+        insert_count = 0
+        for title in title_list:
+            insert_sql = f"""
+                INSERT INTO cold_start_title_pool
+                (title, status)
+                values
+                (%s, %s)
+            """
+            try:
+                self.lam_client.update(
+                    sql=insert_sql,
+                    params=(title, status)
+                )
+                insert_count += 1
+            except Exception as e:
+                update_sql = f"""
+                    UPDATE cold_start_title_pool
+                    SET status = %s
+                    where title = %s and status = %s;
+                """
+                try:
+                    self.lam_client.update(
+                        sql=update_sql,
+                        params=(status, title, self.INIT_STATUS)
+                    )
+                except Exception as e:
+                    error_msg = traceback.format_exc()
+                    log(
+                        task="article_exit_with_title",
+                        function="save_titles",
+                        status="fail",
+                        data={
+                            "e": str(e),
+                            "error_msg": error_msg,
+                        }
+                    )
+                    fail_list.append(title)
+
+        if fail_list:
+            bot(
+                title="冷启动文章标题退场,sql操作失败",
+                detail=fail_list
+            )
+            return -1
+        else:
+            return insert_count
+
+
+def main():
+    """
+    main function
+    :return:
+    """
+    UP_LEVEL_STATUS = 1
+    ARTICLE_EXIT_STATUS = -1
+    READ_TIMES_ON_AVG_THRESHOLD = 0.5
+    DISCOVERY_TIMES_THRESHOLD = 10
+
+    article_title_manager = ArticleTitleStatusManager()
+    if not article_title_manager.init_database():
+        return
+
+    # handle promoted titles
+    up_level_title = get_level_up_articles()
+    up_level_success_count = article_title_manager.save_titles(
+        title_list=up_level_title,
+        status=UP_LEVEL_STATUS
+    )
+
+    # handle exiting titles
+    exit_article_list = article_title_manager.get_bad_articles(
+        read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
+        discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
+    )
+    exit_success_count = article_title_manager.save_titles(
+        title_list=exit_article_list,
+        status=ARTICLE_EXIT_STATUS)
+
+    bot(
+        title="冷启动文章晋级/退场完成",
+        detail={
+            "晋级文章数量": up_level_success_count,
+            "退场文章数量": exit_success_count,
+            "阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
+            "探索次数阈值": DISCOVERY_TIMES_THRESHOLD
+        },
+        mention=False
+    )
+
+
+if __name__ == '__main__':
+    main()
+

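Note on save_titles above: it first attempts an INSERT and, on a duplicate, falls back to an UPDATE that only touches rows still in INIT_STATUS. If cold_start_title_pool has a UNIQUE index on title (an assumption; the schema is not shown in this diff), the same behaviour can be collapsed into a single MySQL upsert. A minimal sketch, not part of this commit, reusing the lam_client.update helper seen above:

    def save_titles_upsert(lam_client, title_list, status, init_status=0):
        """Single-statement alternative to save_titles (sketch; assumes a UNIQUE key on title)."""
        upsert_sql = """
            INSERT INTO cold_start_title_pool (title, status)
            VALUES (%s, %s)
            ON DUPLICATE KEY UPDATE
                status = IF(status = %s, VALUES(status), status);
        """
        written = 0
        for title in title_list:
            # only rows still in init_status get overwritten, mirroring the UPDATE fallback above
            lam_client.update(sql=upsert_sql, params=(title, status, init_status))
            written += 1
        return written

This trades two round trips per title for one, at the cost of requiring the unique index; the two-step version in the diff works without it.
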
+ 0 - 0
stratrgy/upLevel.py → flow_pool/upLevel.py


+ 0 - 4
stratrgy/__init__.py

@@ -1,4 +0,0 @@
-"""
-@author: luojunhui
-"""
-from .strategy import ArticlePoolStrategy

+ 0 - 140
stratrgy/distribution.py

@@ -1,140 +0,0 @@
-"""
-@author: luojunhui
-Distribution logic
-"""
-import json
-import datetime
-from applications import PQMySQL, WeixinSpider
-from tqdm import tqdm
-from config import accountBaseInfo
-
-
-class ArticleDistribution(object):
-    """
-    Cold-start article distribution logic
-    """
-    account_position_dict = {
-        "gh_058e41145a0c": 30,
-        "gh_0e4fd9e88386": 30,
-        "gh_744cb16f6e16": 30,
-        "gh_ac43eb24376d": 30,
-        "gh_970460d9ccec": 30,
-        "gh_56ca3dae948c": 30,
-        "gh_c91b42649690": 30,
-        "gh_6d205db62f04": 30,
-        "gh_e24da99dc899": 30,
-        "gh_4c058673c07e": 30,
-        "gh_03d32e83122f": 30,
-        "gh_c69776baf2cd": 30,
-        "gh_30816d8adb52": 30,
-        "gh_789a40fe7935": 30,
-        "gh_95ed5ecf9363": 30,
-        "gh_3e91f0624545": 30,
-        "gh_57573f01b2ee": 30,
-        "gh_9877c8541764": 30,
-        "gh_6cfd1132df94": 30,
-        "gh_008ef23062ee": 30,
-        "gh_5ae65db96cb7": 30,
-        "gh_be8c29139989": 30,
-        "gh_51e4ad40466d": 30,
-        "gh_d4dffc34ac39": 30,
-        "gh_89ef4798d3ea": 30,
-        "gh_b15de7c99912": 30,
-        "gh_9f8dc5b0c74e": 30,
-        "gh_7b4a5f86d68c": 30,
-        "gh_c5cdf60d9ab4": 5,
-        "gh_0c89e11f8bf3": 5,
-        "gh_e0eb490115f5": 5,
-        "gh_a2901d34f75b": 5,
-        "gh_d5f935d0d1f2": 30
-    }
-    pq_mysql_client = PQMySQL()
-    Spider = WeixinSpider()
-
-    @classmethod
-    def generate_account_dict(cls):
-        """
-        Build the account name to gh_id mapping
-        :return:
-        """
-        account_dict = {}
-        for key in accountBaseInfo:
-            account_name = accountBaseInfo[key]['accountName']
-            account_gh_id = accountBaseInfo[key]['ghId']
-            account_dict[account_name] = account_gh_id
-        return account_dict
-
-    @classmethod
-    def findArticleScoreList(cls, url_md5):
-        """
-        Get the relevance scores of the accounts associated with the article
-        :param url_md5:
-        :return:
-        """
-        sql = f"""
-        select account_score, ori_account from association_articles where url_md5 = '{url_md5}';
-        """
-        response = cls.pq_mysql_client.select(sql=sql)
-        return response
-
-    @classmethod
-    def association_split(cls, article_list):
-        """
-        Distribution logic for association-type articles
-        :param article_list:
-        :return:
-        """
-        account_name_map = cls.generate_account_dict()
-        L = {}
-        for article in tqdm(article_list):
-            link = article['url']
-            url_md5 = article['url_md5']
-            title = article['title']
-            c_id = article['id']
-            title_match_list = cls.findArticleScoreList(url_md5)
-            title_match_list = sorted(title_match_list, key=lambda x: x[0], reverse=True)
-            # print("标题:\t", title)
-            # print("相关账号:\t", title_match_list)
-            # print("\n")
-            for account_tuple in title_match_list:
-                account_name = account_tuple[1]
-                score = account_tuple[0]
-                account_gh_id = account_name_map[account_name]
-                if cls.account_position_dict.get(account_gh_id):
-                    try:
-                        # channel_content_id = cls.Spider.get_article_text(link)['data']['data']['channel_content_id']
-                        channel_content_id = c_id
-                    except:
-                        print(link)
-                        channel_content_id = url_md5
-                    # channel_content_id = "id"
-                    if cls.account_position_dict[account_gh_id] > 0:
-                        if L.get(account_gh_id):
-                            if len(L[account_gh_id]) >= 10:
-                                continue
-                            else:
-                                L[account_gh_id].append([channel_content_id, score])
-                        else:
-                            L[account_gh_id] = [[channel_content_id, score]]
-                        cls.account_position_dict[account_gh_id] -= 1
-                    else:
-                        continue
-        for account in tqdm(L):
-            date_str = datetime.datetime.today().strftime("%Y-%m-%d")
-            print(account, date_str, json.dumps(L[account], ensure_ascii=False))
-            insert_sql = f"""
-            INSERT INTO article_pre_distribute_account
-            (gh_id, date, article_list)
-            VALUES
-            (%s, %s, %s);
-            """
-            try:
-                PQMySQL.update(sql=insert_sql, params=(account, date_str, json.dumps(L[account], ensure_ascii=False)))
-            except Exception as e:
-                print("插入出现问题----{}".format(e))
-        return L
-
-
-
-
-

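For readers of the removed distribution.py: association_split walks each article's related accounts in descending score order and assigns the article to every account that still has budget, bounded both by the account's remaining slots in account_position_dict and by 10 articles per run. A condensed, self-contained sketch of that allocation rule (hypothetical input shape; not part of this commit):

    def allocate(articles, position_budget, per_account_cap=10):
        """Greedy per-account allocation mirroring association_split (sketch)."""
        assigned = {}  # gh_id -> [[content_id, score], ...]
        for article in articles:
            # article["candidates"]: [(score, gh_id), ...], analogous to findArticleScoreList output
            for score, gh_id in sorted(article["candidates"], reverse=True):
                if position_budget.get(gh_id, 0) <= 0:
                    continue
                bucket = assigned.setdefault(gh_id, [])
                if len(bucket) >= per_account_cap:
                    continue
                bucket.append([article["id"], score])
                position_budget[gh_id] -= 1
        return assigned
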
+ 0 - 102
stratrgy/strategy.py

@@ -1,102 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-
-from tqdm import tqdm
-
-from applications.functions import Functions
-from config import accountBaseInfo, pool_level_detail
-
-
-class ArticlePoolStrategy(object):
-    """
-    Long-article strategy pool
-    """
-    Fun = Functions()
-
-    @classmethod
-    def getData(cls, article_list):
-        """
-        :param article_list: list of articles recalled each day
-        """
-        detail_list = []
-        print("查询文章url......")
-        id_tuple = [i['id'] for i in article_list]
-        detail_dict = cls.Fun.matchLinkByIdTuple(channel_id_tuple=tuple(id_tuple))
-        for i in tqdm(article_list):
-            content_id = i['id']
-            i['gh_key'] = detail_dict[content_id]['gh_key']
-            i['url'] = detail_dict[content_id]['url']
-            i['title'] = detail_dict[content_id]['title']
-            detail_list.append(i)
-        print("查询完成, 开始排序")
-        return detail_list
-
-    @classmethod
-    def splitByStrategy(cls, detail_list):
-        """
-        Split by account, position and read multiple
-        :return:
-        """
-        L = []
-        for line in detail_list:
-            key = line['gh_key']
-            article_read = line['read_count']
-            if accountBaseInfo.get(key):
-                avg_read = accountBaseInfo[key]['readAvg']
-                # compute the ratio
-                level_rate = article_read / avg_read - 1
-                obj = {
-                    "key": key,
-                    "avg_read": avg_read,
-                    "article_read": article_read,
-                    "level_rate": level_rate,
-                    "url": line['url'],
-                    "title": line['title']
-                }
-                L.append(obj)
-        L = sorted(L, key=lambda x: x["level_rate"], reverse=True)
-        result = {
-            "Level1": [],
-            "Level2": []
-        }
-        c1 = 0
-        c2 = 0
-        for line in L:
-            gh_key = line['key']
-            if pool_level_detail.get(gh_key):
-                now_level = pool_level_detail[gh_key]
-            else:
-                now_level = "3"
-            match now_level:
-                case "2":
-                    # promote to level 1
-                    if line['level_rate'] > 0.1 and line['avg_read'] >= 1000:
-                        now_title_list = [i['title'] for i in result['Level1']]
-                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                            continue
-                        else:
-                            result['Level1'].append(line)
-                case "3":
-                    if line['level_rate'] > 0.1 and line['avg_read'] >= 100:
-                        now_title_list = [i['title'] for i in result['Level2']]
-                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                            continue
-                        else:
-                            result['Level2'].append(line)
-                case "1":
-                    continue
-                    # if line['level_rate'] > 1.0:
-                    #     now_title_list = [i['title'] for i in result['Level1']]
-                    #     if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                    #         continue
-                    #     else:
-                    #         result['Level1'].append(line)
-        return result
-
-
-
-
-
-
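A worked example of the promotion rule in the removed splitByStrategy, with hypothetical numbers:

    # level_rate = article_read / avg_read - 1
    article_read, avg_read = 1150, 1000
    level_rate = article_read / avg_read - 1                      # 0.15
    # from pool level "2": promote to Level1 when level_rate > 0.1 and avg_read >= 1000,
    # unless a similar title is already in Level1 (TitleSimilarity check)
    promote_to_level1 = level_rate > 0.1 and avg_read >= 1000     # True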