Bläddra i källkod

Merge branch '2024-11-01-luojunhui-article-exit-v1' of luojunhui/LongArticlesJob into master

fengzhoutian 11 månader sedan
förälder
incheckning
b518dd5772
6 ändrade filer med 189 tillägg och 246 borttagningar
  1. 3 0
      flow_pool/__init__.py
  2. 186 0
      flow_pool/exit_article_with_title.py
  3. 0 0
      flow_pool/upLevel.py
  4. 0 4
      stratrgy/__init__.py
  5. 0 140
      stratrgy/distribution.py
  6. 0 102
      stratrgy/strategy.py

+ 3 - 0
flow_pool/__init__.py

@@ -0,0 +1,3 @@
+"""
+@author: luojunhui
+"""

+ 186 - 0
flow_pool/exit_article_with_title.py

@@ -0,0 +1,186 @@
+"""
+@author: luojunhui
+"""
+import traceback
+
+import pandas as pd
+
+from applications import PQMySQL, longArticlesMySQL, bot, log
+from applications.aiditApi import get_generated_article_list
+
+
+def get_level_up_articles() -> set:
+    """
+    :return:
+    """
+    generate_pool_ids = [
+        "20240804003153130851174",
+        "20240802171417146947657",
+        "20240802143345289374071",
+    ]
+    good_title_set = set()
+    for pool_id in generate_pool_ids:
+        articles = get_generated_article_list(pool_id)
+        titles = [article[1] for article in articles]
+        good_title_set.update(titles)
+    return good_title_set
+
+
+class ArticleTitleStatusManager(object):
+    """
+    文章退场表格维护
+    """
+
+    def __init__(self):
+        self.INIT_STATUS = 0
+        self.pq_client = None
+        self.lam_client = None
+
+    def init_database(self) -> bool:
+        """
+        初始化数据库
+        :return:
+        """
+        try:
+            self.pq_client = PQMySQL()
+        except Exception as e:
+            bot(
+                title="文章退场管理任务,数据库连接失败",
+                detail={
+                    "e": str(e),
+                    "error_msg": traceback.format_exc(),
+                    "server": "old server"
+                }
+            )
+            return False
+
+        try:
+            self.lam_client = longArticlesMySQL()
+        except Exception as e:
+            bot(
+                title="文章退场管理任务,数据库连接失败",
+                detail={
+                    "e": str(e),
+                    "error_msg": traceback.format_exc(),
+                    "server": "new server"
+                }
+            )
+        return True
+
+    def get_bad_articles(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
+        """
+        找出质量很差的文章标题,将该标题设置为退场状态
+        :return:
+        """
+        sql = f"""
+            SELECT
+                title, max(read_rate) as max_rate, count(1) as title_count
+            FROM
+                datastat_sort_strategy
+            WHERE position > 2 and fans > 10000
+            GROUP BY title
+            HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
+        """
+        articles = self.lam_client.select(sql)
+        return [i[0] for i in articles]
+
+    def save_titles(self, title_list, status) -> int:
+        """
+        修改标题状态
+        :param status:
+        :param title_list:
+        :return: None
+        """
+        fail_list = []
+        insert_count = 0
+        for title in title_list:
+            insert_sql = f"""
+                INSERT INTO cold_start_title_pool
+                (title, status)
+                values
+                (%s, %s)
+            """
+            try:
+                self.lam_client.update(
+                    sql=insert_sql,
+                    params=(title, status)
+                )
+                insert_count += 1
+            except Exception as e:
+                update_sql = f"""
+                    UPDATE cold_start_title_pool
+                    SET status = %s
+                    where title = %s and status = %s;
+                """
+                try:
+                    self.lam_client.update(
+                        sql=update_sql,
+                        params=(status, title, self.INIT_STATUS)
+                    )
+                except Exception as e:
+                    error_msg = traceback.format_exc()
+                    log(
+                        task="article_exit_with_title",
+                        function="save_titles",
+                        status="fail",
+                        data={
+                            "e": str(e),
+                            "error_msg": error_msg,
+                        }
+                    )
+                    fail_list.append(title)
+
+        if fail_list:
+            bot(
+                title="冷启动文章标题退场,sql操作失败",
+                detail=fail_list
+            )
+            return -1
+        else:
+            return insert_count
+
+
+def main():
+    """
+    main function
+    :return:
+    """
+    UP_LEVEL_STATUS = 1
+    ARTICLE_EXIT_STATUS = -1
+    READ_TIMES_ON_AVG_THRESHOLD = 0.5
+    DISCOVERY_TIMES_THRESHOLD = 10
+
+    article_title_manager = ArticleTitleStatusManager()
+    article_title_manager.init_database()
+
+    # 处理晋级标题
+    up_level_title = get_level_up_articles()
+    up_level_success_count = article_title_manager.save_titles(
+        title_list=up_level_title,
+        status=UP_LEVEL_STATUS
+    )
+
+    # 处理退场标题
+    exit_article_list = article_title_manager.get_bad_articles(
+        read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
+        discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
+    )
+    exit_success_count = article_title_manager.save_titles(
+        title_list=exit_article_list,
+        status=ARTICLE_EXIT_STATUS)
+
+    bot(
+        title="冷启动文章晋级/退场完成",
+        detail={
+            "晋级文章数量": up_level_success_count,
+            "退场文章数量": exit_success_count,
+            "阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
+            "探索次数阈值": DISCOVERY_TIMES_THRESHOLD
+        },
+        mention=False
+    )
+
+
+if __name__ == '__main__':
+    main()
+

+ 0 - 0
stratrgy/upLevel.py → flow_pool/upLevel.py


+ 0 - 4
stratrgy/__init__.py

@@ -1,4 +0,0 @@
-"""
-@author: luojunhui
-"""
-from .strategy import ArticlePoolStrategy

+ 0 - 140
stratrgy/distribution.py

@@ -1,140 +0,0 @@
-"""
-@author: luojunhui
-分发逻辑
-"""
-import json
-import datetime
-from applications import PQMySQL, WeixinSpider
-from tqdm import tqdm
-from config import accountBaseInfo
-
-
-class ArticleDistribution(object):
-    """
-    Distribution logic for cold-start articles.
-    """
-    # Remaining slots per account gh_id; association_split() decrements these
-    # in place, so the counts carry over between calls on the same class.
-    account_position_dict = {
-        "gh_058e41145a0c": 30,
-        "gh_0e4fd9e88386": 30,
-        "gh_744cb16f6e16": 30,
-        "gh_ac43eb24376d": 30,
-        "gh_970460d9ccec": 30,
-        "gh_56ca3dae948c": 30,
-        "gh_c91b42649690": 30,
-        "gh_6d205db62f04": 30,
-        "gh_e24da99dc899": 30,
-        "gh_4c058673c07e": 30,
-        "gh_03d32e83122f": 30,
-        "gh_c69776baf2cd": 30,
-        "gh_30816d8adb52": 30,
-        "gh_789a40fe7935": 30,
-        "gh_95ed5ecf9363": 30,
-        "gh_3e91f0624545": 30,
-        "gh_57573f01b2ee": 30,
-        "gh_9877c8541764": 30,
-        "gh_6cfd1132df94": 30,
-        "gh_008ef23062ee": 30,
-        "gh_5ae65db96cb7": 30,
-        "gh_be8c29139989": 30,
-        "gh_51e4ad40466d": 30,
-        "gh_d4dffc34ac39": 30,
-        "gh_89ef4798d3ea": 30,
-        "gh_b15de7c99912": 30,
-        "gh_9f8dc5b0c74e": 30,
-        "gh_7b4a5f86d68c": 30,
-        "gh_c5cdf60d9ab4": 5,
-        "gh_0c89e11f8bf3": 5,
-        "gh_e0eb490115f5": 5,
-        "gh_a2901d34f75b": 5,
-        "gh_d5f935d0d1f2": 30
-    }
-    # Both clients are instantiated when the class body is executed.
-    pq_mysql_client = PQMySQL()
-    Spider = WeixinSpider()
-
-    @classmethod
-    def generate_account_dict(cls):
-        """
-        Build a mapping from account name to gh_id out of accountBaseInfo.
-        :return: dict of accountName -> ghId
-        """
-        account_dict = {}
-        for key in accountBaseInfo:
-            account_name = accountBaseInfo[key]['accountName']
-            account_gh_id = accountBaseInfo[key]['ghId']
-            account_dict[account_name] = account_gh_id
-        return account_dict
-
-    @classmethod
-    def findArticleScoreList(cls, url_md5):
-        """
-        Fetch the relevance scores of the accounts associated with an article.
-        :param url_md5: value matched against association_articles.url_md5
-        :return: result of the select; rows are (account_score, ori_account)
-        """
-        # NOTE(review): url_md5 is interpolated directly into the SQL text.
-        sql = f"""
-        select account_score, ori_account from association_articles where url_md5 = '{url_md5}';
-        """
-        response = cls.pq_mysql_client.select(sql=sql)
-        return response
-
-    @classmethod
-    def association_split(cls, article_list):
-        """
-        Distribution logic for association-type articles.
-        :param article_list: dicts with at least 'url', 'url_md5', 'title' and 'id'
-        :return: dict of gh_id -> list of [channel_content_id, score]
-        """
-        account_name_map = cls.generate_account_dict()
-        L = {}
-        for article in tqdm(article_list):
-            link = article['url']
-            url_md5 = article['url_md5']
-            title = article['title']
-            c_id = article['id']
-            title_match_list = cls.findArticleScoreList(url_md5)
-            # Highest score first
-            title_match_list = sorted(title_match_list, key=lambda x: x[0], reverse=True)
-            # print("title:\t", title)
-            # print("related accounts:\t", title_match_list)
-            # print("\n")
-            for account_tuple in title_match_list:
-                account_name = account_tuple[1]
-                score = account_tuple[0]
-                # Raises KeyError when the name is missing from accountBaseInfo.
-                account_gh_id = account_name_map[account_name]
-                if cls.account_position_dict.get(account_gh_id):
-                    # The try body is a plain local assignment, so the bare
-                    # except below is effectively never taken.
-                    try:
-                        # channel_content_id = cls.Spider.get_article_text(link)['data']['data']['channel_content_id']
-                        channel_content_id = c_id
-                    except:
-                        print(link)
-                        channel_content_id = url_md5
-                    # channel_content_id = "id"
-                    if cls.account_position_dict[account_gh_id] > 0:
-                        if L.get(account_gh_id):
-                            # At most 10 articles are collected per account.
-                            if len(L[account_gh_id]) >= 10:
-                                continue
-                            else:
-                                L[account_gh_id].append([channel_content_id, score])
-                        else:
-                            L[account_gh_id] = [[channel_content_id, score]]
-                        cls.account_position_dict[account_gh_id] -= 1
-                    else:
-                        continue
-        for account in tqdm(L):
-            date_str = datetime.datetime.today().strftime("%Y-%m-%d")
-            print(account, date_str, json.dumps(L[account], ensure_ascii=False))
-            insert_sql = f"""
-            INSERT INTO article_pre_distribute_account
-            (gh_id, date, article_list)
-            VALUES
-            (%s, %s, %s);
-            """
-            # NOTE(review): update is called on the PQMySQL class rather than on
-            # cls.pq_mysql_client - confirm update is a static/class method.
-            # Any failure is only printed.
-            try:
-                PQMySQL.update(sql=insert_sql, params=(account, date_str, json.dumps(L[account], ensure_ascii=False)))
-            except Exception as e:
-                print("插入出现问题----{}".format(e))
-        return L
-
-
-
-
-

+ 0 - 102
stratrgy/strategy.py

@@ -1,102 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-
-from tqdm import tqdm
-
-from applications.functions import Functions
-from config import accountBaseInfo, pool_level_detail
-
-
-class ArticlePoolStrategy(object):
-    """
-    Long-article strategy pool.
-    """
-    # Shared helper instance, created when the class body is executed.
-    Fun = Functions()
-
-    @classmethod
-    def getData(cls, article_list):
-        """
-        Attach gh_key, url and title to every article, looked up by its id.
-        :param article_list: list of the articles recalled each day
-        :return: list of the same dicts, mutated in place with the extra keys
-        """
-        detail_list = []
-        print("查询文章url......")
-        id_tuple = [i['id'] for i in article_list]
-        detail_dict = cls.Fun.matchLinkByIdTuple(channel_id_tuple=tuple(id_tuple))
-        for i in tqdm(article_list):
-            content_id = i['id']
-            # Raises KeyError when an id is missing from detail_dict.
-            i['gh_key'] = detail_dict[content_id]['gh_key']
-            i['url'] = detail_dict[content_id]['url']
-            i['title'] = detail_dict[content_id]['title']
-            detail_list.append(i)
-        print("查询完成, 开始排序")
-        return detail_list
-
-    @classmethod
-    def splitByStrategy(cls, detail_list):
-        """
-        Split articles by account / position / read multiple.
-        :param detail_list: dicts with 'gh_key', 'read_count', 'url' and 'title'
-        :return: dict with keys "Level1" and "Level2", each a list of article dicts
-        """
-        L = []
-        for line in detail_list:
-            key = line['gh_key']
-            article_read = line['read_count']
-            # Articles whose account is not in accountBaseInfo are dropped.
-            if accountBaseInfo.get(key):
-                avg_read = accountBaseInfo[key]['readAvg']
-                # Compute the ratio
-                # NOTE(review): a readAvg of 0 would raise ZeroDivisionError here.
-                level_rate = article_read / avg_read - 1
-                obj = {
-                    "key": key,
-                    "avg_read": avg_read,
-                    "article_read": article_read,
-                    "level_rate": level_rate,
-                    "url": line['url'],
-                    "title": line['title']
-                }
-                L.append(obj)
-        # Best-performing articles first
-        L = sorted(L, key=lambda x: x["level_rate"], reverse=True)
-        result = {
-            "Level1": [],
-            "Level2": []
-        }
-        # c1 and c2 are never read afterwards.
-        c1 = 0
-        c2 = 0
-        for line in L:
-            gh_key = line['key']
-            # Accounts without an entry in pool_level_detail are treated as level "3".
-            if pool_level_detail.get(gh_key):
-                now_level = pool_level_detail[gh_key]
-            else:
-                now_level = "3"
-            # match/case requires Python 3.10+
-            match now_level:
-                case "2":
-                    # Promote toward level 1
-                    if line['level_rate'] > 0.1 and line['avg_read'] >= 1000:
-                        now_title_list = [i['title'] for i in result['Level1']]
-                        # Skip when TitleSimilarity returns a truthy value for
-                        # this title against those already collected.
-                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                            continue
-                        else:
-                            result['Level1'].append(line)
-                case "3":
-                    if line['level_rate'] > 0.1 and line['avg_read'] >= 100:
-                        now_title_list = [i['title'] for i in result['Level2']]
-                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                            continue
-                        else:
-                            result['Level2'].append(line)
-                case "1":
-                    # Level-1 accounts are skipped; the old rule is kept below, disabled.
-                    continue
-                    # if line['level_rate'] > 1.0:
-                    #     now_title_list = [i['title'] for i in result['Level1']]
-                    #     if cls.Fun.TitleSimilarity(now_title_list, line['title']):
-                    #         continue
-                    #     else:
-                    #         result['Level1'].append(line)
-        return result
-
-
-
-
-
-