Browse Source

修改排序算法-返回重复视频

罗俊辉 10 months ago
parent
commit
86c34cb5d7
3 changed files with 127 additions and 98 deletions
  1. 3 3
      applications/functions/article_tools.py
  2. 0 1
      applications/pipeline.py
  3. 124 94
      routes/AccountArticleRank.py

+ 3 - 3
applications/functions/article_tools.py

@@ -21,8 +21,8 @@ def title_sim_v2(title_a, title_b, thredhold=0.8):
 
 
 
 
 def title_sim_v2_by_list(title_target, title_list):
 def title_sim_v2_by_list(title_target, title_list):
-    for title, url in title_list:
+    for title in title_list:
         sim_score = title_sim_v2(title_target, title)
         sim_score = title_sim_v2(title_target, title)
         if sim_score:
         if sim_score:
-            return (title, url)
-    return None
+            return True
+    return False

+ 0 - 1
applications/pipeline.py

@@ -43,7 +43,6 @@ class LongArticlesPipeline(object):
         body = {
         body = {
             "text": title
             "text": title
         }
         }
-        print(body)
         response = requests.post(
         response = requests.post(
             url=url,
             url=url,
             json=body,
             json=body,

+ 124 - 94
routes/AccountArticleRank.py

@@ -7,10 +7,31 @@ import time
 from uuid import uuid4
 from uuid import uuid4
 
 
 from applications.aliyunLog import AliyunArticleLog
 from applications.aliyunLog import AliyunArticleLog
-from applications.functions import ArticleRank
+from applications.functions import ArticleRank, title_sim_v2_by_list
 from applications.pipeline import LongArticlesPipeline
 from applications.pipeline import LongArticlesPipeline
 
 
 
 
+def deduplication(rank1, rank2, rank3):
+    """
+    De-duplicate three ranked article lists by title similarity.
+
+    The lists are scanned in the order rank1, rank2, rank3. An item is kept
+    only when title_sim_v2_by_list reports no match between its title and
+    the titles of the items kept so far; otherwise it is dropped. Earlier
+    lists, and earlier positions within a list, therefore take precedence.
+
+    :param rank1: first list of items, each indexable by 'title'; a falsy
+                  value (e.g. None or an empty list) yields an empty result
+    :param rank2: second list, same expectations as rank1
+    :param rank3: third list, same expectations as rank1
+    :return: tuple of three lists holding the kept items of rank1, rank2
+             and rank3 respectively, in their original relative order
+    """
+    # Titles of every item kept so far, shared across all three lists.
+    dup_list = []
+    # One list of kept items per input list, in input order.
+    final_result = []
+    for item_list in [rank1, rank2, rank3]:
+        result = []
+        if item_list:
+            for item in item_list:
+                title = item['title']
+                if title_sim_v2_by_list(title, dup_list):
+                    # The printed message means "duplicate title, filtered out".
+                    print("标题重复,已经过滤\t", title)
+                    continue
+                else:
+                    result.append(item)
+                    dup_list.append(title)
+        # Appended even when item_list is falsy, so there are always three entries.
+        final_result.append(result)
+    return final_result[0], final_result[1], final_result[2]
+
 
 
 class AccountArticleRank(object):
 class AccountArticleRank(object):
     """
     """
@@ -22,6 +43,7 @@ class AccountArticleRank(object):
         :param params: 请求参数
         :param params: 请求参数
         :param mysql_client: 数据库链接池
         :param mysql_client: 数据库链接池
         """
         """
+        self.filter_list = None
         self.publishArticleList = None
         self.publishArticleList = None
         self.publishNum = None
         self.publishNum = None
         self.strategy = None
         self.strategy = None
@@ -34,8 +56,10 @@ class AccountArticleRank(object):
         self.logger = AliyunArticleLog(request_id=self.request_id, alg="ArticleRank")
         self.logger = AliyunArticleLog(request_id=self.request_id, alg="ArticleRank")
         self.pipeline = LongArticlesPipeline()
         self.pipeline = LongArticlesPipeline()
 
 
-
     def filter(self):
     def filter(self):
+        """
+        过滤器
+        """
         self.publishArticleList = []
         self.publishArticleList = []
         self.filter_list = []
         self.filter_list = []
         for item in self.params['publishArticleList']:
         for item in self.params['publishArticleList']:
@@ -78,82 +102,87 @@ class AccountArticleRank(object):
             return response
             return response
 
 
     async def basic_rank(self):
     async def basic_rank(self):
+        """
+        基础排序
+        :return:
+        """
         # 第一步把所有文章标题分为3组
         # 第一步把所有文章标题分为3组
         article_list1_ori = [i for i in self.publishArticleList if "【1】" in i['producePlanName']]
         article_list1_ori = [i for i in self.publishArticleList if "【1】" in i['producePlanName']]
         article_list2_ori = [i for i in self.publishArticleList if "【2】" in i['producePlanName']]
         article_list2_ori = [i for i in self.publishArticleList if "【2】" in i['producePlanName']]
-        article_list3_ori = [i for i in self.publishArticleList if not i in article_list1_ori and not i in article_list2_ori]
-
-        # 全局去重,保留优先级由  L1 --> L2 --> L3
-        hash_map = {}
-
-        article_list1 = []
-        for i in article_list1_ori:
-            title = i['title']
-            if hash_map.get(title):
-                continue
-            else:
-                article_list1.append(i)
-                hash_map[title] = 1
-
-        article_list2 = []
-        for i in article_list2_ori:
-            title = i['title']
-            if hash_map.get(title):
-                continue
-            else:
-                article_list2.append(i)
-                hash_map[title] = 2
+        article_list3_ori = [i for i in self.publishArticleList if
+                             not i in article_list1_ori and not i in article_list2_ori]
 
 
-        article_list3 = []
-        for i in article_list3_ori:
-            title = i['title']
-            if hash_map.get(title):
-                continue
-            else:
-                article_list3.append(i)
-                hash_map[title] = 1
+        # # 全局去重,保留优先级由  L1 --> L2 --> L3
+        # hash_map = {}
+        #
+        # article_list1 = []
+        # for i in article_list1_ori:
+        #     title = i['title']
+        #     if hash_map.get(title):
+        #         continue
+        #     else:
+        #         article_list1.append(i)
+        #         hash_map[title] = 1
+        #
+        # article_list2 = []
+        # for i in article_list2_ori:
+        #     title = i['title']
+        #     if hash_map.get(title):
+        #         continue
+        #     else:
+        #         article_list2.append(i)
+        #         hash_map[title] = 2
+        #
+        # article_list3 = []
+        # for i in article_list3_ori:
+        #     title = i['title']
+        #     if hash_map.get(title):
+        #         continue
+        #     else:
+        #         article_list3.append(i)
+        #         hash_map[title] = 1
 
 
         # 第二步对article_list1, article_list3按照得分排序, 对article_list2按照播放量排序
         # 第二步对article_list1, article_list3按照得分排序, 对article_list2按照播放量排序
-        if article_list1:
+        if article_list1_ori:
             rank1 = ArticleRank().rank(
             rank1 = ArticleRank().rank(
                 account_list=[self.accountName],
                 account_list=[self.accountName],
-                text_list=[i['title'] for i in article_list1]
+                text_list=[i['title'] for i in article_list1_ori]
             )
             )
             score_list1 = rank1[self.accountName]['score_list']
             score_list1 = rank1[self.accountName]['score_list']
             ranked_1 = []
             ranked_1 = []
             for index, value in enumerate(score_list1):
             for index, value in enumerate(score_list1):
-                obj = article_list1[index]
+                obj = article_list1_ori[index]
                 obj['score'] = value + 1000
                 obj['score'] = value + 1000
                 ranked_1.append(obj)
                 ranked_1.append(obj)
-            ranked_1 = sorted(ranked_1, key=lambda x:x['score'], reverse=True)
+            ranked_1 = sorted(ranked_1, key=lambda x: x['score'], reverse=True)
         else:
         else:
             ranked_1 = []
             ranked_1 = []
         # rank2
         # rank2
-        if article_list2:
-            for item in article_list2:
+        if article_list2_ori:
+            for item in article_list2_ori:
                 item['score'] = 100
                 item['score'] = 100
-            ranked_2 = sorted(article_list2, key=lambda x:x['crawlerViewCount'], reverse=True)
+            ranked_2 = sorted(article_list2_ori, key=lambda x: x['crawlerViewCount'], reverse=True)
         else:
         else:
             ranked_2 = []
             ranked_2 = []
 
 
         # rank3
         # rank3
-        if article_list3:
+        if article_list3_ori:
             rank3 = ArticleRank().rank(
             rank3 = ArticleRank().rank(
                 account_list=[self.accountName],
                 account_list=[self.accountName],
-                text_list=[i['title'] for i in article_list3]
+                text_list=[i['title'] for i in article_list3_ori]
             )
             )
             score_list3 = rank3[self.accountName]['score_list']
             score_list3 = rank3[self.accountName]['score_list']
             ranked_3 = []
             ranked_3 = []
             for index, value in enumerate(score_list3):
             for index, value in enumerate(score_list3):
-                obj = article_list3[index]
+                obj = article_list3_ori[index]
                 obj['score'] = value
                 obj['score'] = value
                 ranked_3.append(obj)
                 ranked_3.append(obj)
-            ranked_3 = sorted(ranked_3, key=lambda x:x['score'], reverse=True)
+            ranked_3 = sorted(ranked_3, key=lambda x: x['score'], reverse=True)
         else:
         else:
             ranked_3 = []
             ranked_3 = []
         self.logger.log(
         self.logger.log(
             code="1004",
             code="1004",
-            msg="去重排序完成",
+            msg="排序完成",
             data={
             data={
                 "rank1": ranked_1,
                 "rank1": ranked_1,
                 "rank2": ranked_2,
                 "rank2": ranked_2,
@@ -167,59 +196,60 @@ class AccountArticleRank(object):
         Rank Version 1
         Rank Version 1
         :return:
         :return:
         """
         """
-        # try:
-        ranked_1, ranked_2, ranked_3 = await self.basic_rank()
         try:
         try:
-            L = []
-            if ranked_1:
-                L.append(ranked_1[0])
-                if ranked_2:
-                    L.append(ranked_2[0])
-            else:
-                if ranked_2:
-                    if len(ranked_2) > 1:
-                        for i in ranked_2[:2]:
-                            L.append(i)
-                    else:
+            ranked_1_d, ranked_2_d, ranked_3_d = await self.basic_rank()
+            ranked_1, ranked_2, ranked_3 = deduplication(ranked_1_d, ranked_2_d, ranked_3_d)
+            try:
+                L = []
+                if ranked_1:
+                    L.append(ranked_1[0])
+                    if ranked_2:
                         L.append(ranked_2[0])
                         L.append(ranked_2[0])
-            for item in ranked_3:
-                L.append(item)
+                else:
+                    if ranked_2:
+                        if len(ranked_2) > 1:
+                            for i in ranked_2[:2]:
+                                L.append(i)
+                        else:
+                            L.append(ranked_2[0])
+                for item in ranked_3:
+                    L.append(item)
 
 
-            result = {
-                "accountId": self.accountId,
-                "accountName": self.accountName,
-                "ghId": self.ghId,
-                "strategy": self.strategy,
-                "publishNum": self.publishNum,
-                "rank_list": L[:self.publishNum],
-                "filter_list": self.filter_list
-            }
-            self.logger.log(
-                code=1006,
-                msg="rank successfully",
-                data=result
-            )
-            response = {"status": "Rank Success", "data": result, "code": 1}
-        except Exception as e:
-            result = {
-                "accountId": self.accountId,
-                "accountName": self.accountName,
-                "ghId": self.ghId,
-                "strategy": self.strategy,
-                "publishNum": self.publishNum,
-                "rank_list": self.publishArticleList[: self.publishNum],
-                "filter_list": self.filter_list
-            }
-            self.logger.log(
-                code=1007,
-                msg="rank failed because of {}".format(e),
-                data=result
-            )
-            response = {"status": "Rank Fail", "data": result, "code": 1}
-        return response
-        # except:
-        #     result = {"code": 2, "info": "account is not exist"}
-        #     return result
+                result = {
+                    "accountId": self.accountId,
+                    "accountName": self.accountName,
+                    "ghId": self.ghId,
+                    "strategy": self.strategy,
+                    "publishNum": self.publishNum,
+                    "rank_list": L[:self.publishNum],
+                    "filter_list": self.filter_list
+                }
+                self.logger.log(
+                    code=1006,
+                    msg="rank successfully",
+                    data=result
+                )
+                response = {"status": "Rank Success", "data": result, "code": 1}
+            except Exception as e:
+                result = {
+                    "accountId": self.accountId,
+                    "accountName": self.accountName,
+                    "ghId": self.ghId,
+                    "strategy": self.strategy,
+                    "publishNum": self.publishNum,
+                    "rank_list": self.publishArticleList[: self.publishNum],
+                    "filter_list": self.filter_list
+                }
+                self.logger.log(
+                    code=1007,
+                    msg="rank failed because of {}".format(e),
+                    data=result
+                )
+                response = {"status": "Rank Fail", "data": result, "code": 1}
+            return response
+        except:
+            result = {"code": 2, "info": "account is not exist"}
+            return result
 
 
     async def rank_v2(self):
     async def rank_v2(self):
         """
         """