소스 검색

离线推荐优化

luojunhui 6 일 전
부모
커밋
338aa73cd4
3개의 변경된 파일, 31줄 추가 및 61줄 삭제
  1. 17 29
      app/recommend/offline_recommend/core.py
  2. 14 15
      app/recommend/offline_recommend/strategy/get_top_article.py
  3. 0 17
      app/recommend/offline_recommend/strategy/i2i.py

+ 17 - 29
app/recommend/offline_recommend/core.py

@@ -46,31 +46,6 @@ class BaseOffRecommendUtils:
         unsafe_titles_set.update(apollo_bad_titles)
         return unsafe_titles_set
 
-    # 获取账号的 Top 文章
-    async def get_account_top_articles(
-        self, gh_id: str, strategy: str
-    ) -> List[Dict[str, str]]:
-        match strategy:
-            # 后续拓展策略
-            case _:
-                query = GetTopArticleStrategy.base()
-
-        top_articles = await LongArticleDatabaseMapper.Recommend.get_top_articles(
-            pool=self.pool, query=query, gh_id=gh_id
-        )
-
-        return top_articles
-
-    # 获取标题的推荐文章
-    async def get_recommend_articles(self, title: str, strategy: str) -> List:
-        match strategy:
-            # 后续拓展策略
-            case _:
-                query = I2I.base(title)
-
-        recommend_articles = await self.read_from_odps(query)
-        return recommend_articles
-
     # 获取一批标题的推荐标题
     async def get_recommend_articles_for_batch_titles(
         self, title_list: List[str], strategy: str
@@ -82,8 +57,6 @@ class BaseOffRecommendUtils:
             case _:
                 query = I2I.batch_base(title_list)
 
-        print(query)
-
         recommend_articles = await self.read_from_odps(query)
         return recommend_articles
 
@@ -156,8 +129,23 @@ class BaseOfflineRecommend(BaseOffRecommendUtils):
         published_titles: Set[str],
     ):
         gh_id: str = account_info["gh_id"]
-        top_articles = await self.get_account_top_articles(gh_id, strategy)
-        top_titles = [i["title"] for i in top_articles]
+        account_name: str = account_info["account_name"]
+
+        match strategy:
+            case "v1":
+                odps_query = GetTopArticleStrategy.strategy_v1(account_name)
+                top_articles = await self.read_from_odps(odps_query)
+                top_titles = [i.title for i in top_articles]
+
+            case "base":
+                mysql_query = GetTopArticleStrategy.base()
+                top_articles = await LongArticleDatabaseMapper.Recommend.get_top_articles(
+                    pool=self.pool, query=mysql_query, gh_id=gh_id
+                )
+                top_titles = [i["title"] for i in top_articles]
+
+            case _:
+                return []
 
         recommend_articles = await self.get_recommend_articles_for_batch_titles(
             top_titles, strategy

+ 14 - 15
app/recommend/offline_recommend/strategy/get_top_article.py

@@ -15,20 +15,19 @@ class GetTopArticleStrategy(BaseStrategy):
         return query
 
     @staticmethod
-    def strategy_v1() -> str:
-        query = """
-            SELECT title, sum(view_count) as total_view_count, sum(fans) as total_fan_count
-            FROM datastat_sort_strategy
-            WHERE position = 1 and gh_id = %s AND date_str >= '20250501' AND view_count > 1000
+    def strategy_v1(account_name: str, uid_cnt_threshold: int = 100) -> str:
+        odps_query = f"""
+            SELECT  title
+                    ,COUNT(1) AS uid_cnt
+            FROM    article_union_id_mapper
+            WHERE   union_id IN (
+                        SELECT  union_id
+                        FROM    gzh_fans_info
+                        WHERE   account_name = '{account_name}'
+                        AND     dt = MAX_PT('gzh_fans_info')
+                    )
+            AND     accountname != '{account_name}'
             GROUP BY title
-            ORDER BY sum(view_count) / sum(fans) DESC
-            LIMIT 25;
+            HAVING  uid_cnt > {uid_cnt_threshold};
         """
-        return query
-
-    @staticmethod
-    def strategy_v2() -> str:
-        query = """
-            SELECT date_str, title, view_count from datastat_sort_strategy where position = 1 and gh_id = %s
-        """
-        return query
+        return odps_query

+ 0 - 17
app/recommend/offline_recommend/strategy/i2i.py

@@ -48,23 +48,6 @@ class I2I(BaseStrategy):
         """
         return query
 
-    @staticmethod
-    def batch_summary(title_list, limit: int = 500):
-        title_tuple = tuple(title_list)
-        query = f"""
-            SELECT  rec_title
-                    ,SUM(collinear_cnt) AS total_collinear_cnt
-                    ,SUM(base_cnt) AS total_base_cnt
-                    ,SUM(collinear_cnt) / (SUM(base_cnt) + 1000) AS rec_collinear_ratio
-            FROM    loghubods.t2i_records
-            WHERE   src_title IN {title_tuple}
-            AND     data_version = 'v3'
-            GROUP BY rec_title
-            ORDER BY rec_collinear_ratio DESC
-            LIMIT {limit};
-        """
-        return query
-
     @staticmethod
     def strategy_v1(title_list, limit: int = 500):
         title_tuple = tuple(title_list)