Ver Fonte

修改排序算法-返回重复视频

罗俊辉 há 1 ano atrás
pai
commit
bd2f012dea

+ 64 - 0
applications/aliyunLog.py

@@ -0,0 +1,64 @@
+"""
+@author: luojunhui
+"""
+import time
+import json
+from aliyun.log import LogClient, PutLogsRequest, LogItem
+
+
+class AliyunArticleLog(object):
+    """
+    Best-effort writer that sends long-article algorithm events to an
+    Aliyun log store.
+
+    Every record carries the request id and algorithm name given at
+    construction time.
+    """
+
+    def __init__(self, request_id, alg):
+        """
+        :param request_id: value written to the "request_id" field of every record
+        :param alg: value written to the "alg" field of every record
+        """
+        self.request_id = request_id
+        self.alg = alg
+
+    def log(self, code, mode="prod", msg=None, data=None):
+        """
+        Send one log record. Failures are printed and never raised.
+
+        :param code: status code
+        :param mode: "prod" for production, "dev" for testing
+        :param msg: message
+        :param data: payload serialised as JSON; an empty or None payload is
+                     written as ""
+        """
+        # SECURITY(review): these credentials are committed in source. They
+        # should be rotated and loaded from configuration instead.
+        access_key_id = "LTAIP6x1l3DXfSxm"
+        access_key = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon"
+        project = "changwen-alg"
+        log_store = "long_articles_algorithm"
+        endpoint = "cn-hangzhou.log.aliyuncs.com"
+
+        # Serialisation and client construction live inside the try block so
+        # that a logging problem can never break the caller's request.
+        try:
+            # default=str is deliberate: this is a log sink, so a value that
+            # JSON cannot encode is stringified instead of losing the record.
+            payload = json.dumps(data, ensure_ascii=False, default=str) if data else ""
+            log_item = LogItem()
+            log_item.set_contents(
+                [
+                    ("mode", str(mode)),
+                    ("code", str(code)),
+                    ("alg", str(self.alg)),
+                    ("msg", str(msg)),
+                    ("data", payload),
+                    ("request_id", str(self.request_id)),
+                    ("timestamp", str(int(time.time()))),
+                ]
+            )
+            request = PutLogsRequest(
+                project=project,
+                logstore=log_store,
+                topic="",
+                source="",
+                logitems=[log_item],
+                compress=False,
+            )
+            client = LogClient(endpoint, access_key_id, access_key)
+            client.put_logs(request)
+        except Exception as e:
+            print("日志失败")
+            print(e)
+
+
+

+ 3 - 2
applications/asyncMySQL.py

@@ -48,13 +48,14 @@ class AsyncMySQLClient(object):
                 result = await cursor.fetchall()
                 return result
 
-    async def async_insert(self, sql):
+    async def async_insert(self, sql, params):
         """
-        insert and update method
-        :param sql:
+        Run one write statement (insert or update) and commit it.
+
+        A connection is acquired from self.app.mysql_pool for the duration
+        of the call.
+
+        :param sql: statement text handed to cursor.execute
+        :param params: values handed to cursor.execute together with sql
         :return:
         """
         async with self.app.mysql_pool.acquire() as coon:
             async with coon.cursor() as cursor:
-                await cursor.execute(sql)
+                await cursor.execute(sql, params)
                 await coon.commit()

+ 3 - 3
applications/functions/article_tools.py

@@ -21,8 +21,8 @@ def title_sim_v2(title_a, title_b, thredhold=0.8):
 
 
 def title_sim_v2_by_list(title_target, title_list):
-    for title, url in title_list:
-        sim_score = title_sim_v2(title_target, title)
-        if sim_score:
-            return (title, url)
-    return None
+    """
+    Report whether any entry of title_list is similar to title_target.
+
+    :param title_target: title being checked
+    :param title_list: iterable of candidate titles
+    :return: True as soon as title_sim_v2 gives a truthy result for a
+             candidate, otherwise False
+    """
+    return any(
+        title_sim_v2(title_target, candidate) for candidate in title_list
+    )

+ 79 - 0
applications/pipeline.py

@@ -0,0 +1,79 @@
+"""
+@author: luojunhui
+"""
+import requests
+
+from applications.functions import title_sim_v2_by_list
+from applications.functions import get_article_title_url_list
+
+
+class LongArticlesPipeline(object):
+    """
+    Pre-ranking filters for long articles: a history check against titles
+    the account already has, and a sensitive-title check.
+    """
+
+    @classmethod
+    def history_exists(cls, title, account_nickname, plan_name):
+        """
+        Tell whether a similar title already exists in the account history.
+
+        The history is fetched with index_list [1, 2] when plan_name contains
+        "【1】" or "【2】", and with [1, ..., 8] otherwise.
+
+        :param title: title of the candidate article
+        :param account_nickname: account whose history is fetched
+        :param plan_name: produce-plan name of the candidate article
+        :return: True when title_sim_v2_by_list reports a similar title
+        """
+        if "【1】" in plan_name or "【2】" in plan_name:
+            index_list = [1, 2]
+        else:
+            index_list = [1, 2, 3, 4, 5, 6, 7, 8]
+        # NOTE(review): title_sim_v2_by_list treats every element as a bare
+        # title; confirm get_article_title_url_list no longer yields
+        # (title, url) pairs.
+        account_title_list = get_article_title_url_list(
+            account_nickname,
+            index_list=index_list
+        )
+        return bool(title_sim_v2_by_list(title, account_title_list))
+
+    @classmethod
+    def article_safe(cls, title, timeout=10):
+        """
+        Ask the sensitive-word service about a title.
+
+        Despite the method name, a truthy result means the title was
+        flagged: deal() filters the article when this returns truthy.
+
+        :param title: text sent to the service
+        :param timeout: seconds to wait for the service; without a timeout a
+                        stalled service would block the request forever
+        :return: the "is_sensitive" field of the JSON response
+        """
+        url = "http://192.168.100.31:8177/sensitive/is_sensitive"
+        response = requests.post(
+            url=url,
+            json={"text": title},
+            headers={"Content-Type": "application/json"},
+            timeout=timeout
+        )
+        return response.json()['is_sensitive']
+
+    @classmethod
+    def deal(cls, article_obj):
+        """
+        Run both filters on one article.
+
+        :param article_obj: mapping with 'title', 'crawlerAccountName' and
+                            'producePlanName'
+        :return: False when the article passes; otherwise a dict with the
+                 keys "fileterReason" and "status"
+        """
+        # The key is spelled "fileterReason" on purpose: the caller reads it
+        # under exactly this name.
+        if cls.history_exists(
+            title=article_obj['title'],
+            account_nickname=article_obj['crawlerAccountName'],
+            plan_name=article_obj['producePlanName']
+        ):
+            return {
+                "fileterReason": "历史已发布文章",
+                "status": True
+            }
+        if cls.article_safe(title=article_obj['title']):
+            return {
+                "fileterReason": "安全违规",
+                "status": True
+            }
+        return False

+ 217 - 67
routes/AccountArticleRank.py

@@ -1,27 +1,36 @@
 """
 @author: luojunhui
 """
+import json
+import time
 
-from applications.functions import ArticleRank
-from applications.functions import title_sim_v2_by_list
-from applications.functions import get_article_title_url_list
+from uuid import uuid4
 
+from applications.aliyunLog import AliyunArticleLog
+from applications.functions import ArticleRank, title_sim_v2_by_list
+from applications.pipeline import LongArticlesPipeline
 
-def has_same(title, account_nickname):
-    """
-    判断是否存储
-    :param title:
-    :param account_nickname:
-    :return:
-    """
-    account_title_list = get_article_title_url_list(
-        account_nickname,
-        # max_time='20240603'
-    )
-    sim_res = title_sim_v2_by_list(title, account_title_list)
-    if sim_res:
-        return True
-    return False
+
+def deduplication(rank1, rank2, rank3):
+    """
+    Drop articles whose title is similar to one that was already kept.
+
+    The three groups are walked in order, so an earlier group wins over a
+    later one and, inside a group, an earlier article wins over a later one.
+
+    :param rank1: first group of article dicts (may be empty or None)
+    :param rank2: second group of article dicts (may be empty or None)
+    :param rank3: third group of article dicts (may be empty or None)
+    :return: tuple of three lists holding the kept articles of each group
+    """
+    seen_titles = []
+    deduped_groups = []
+    for group in (rank1, rank2, rank3):
+        kept = []
+        for article in group or []:
+            headline = article['title']
+            if title_sim_v2_by_list(headline, seen_titles):
+                print("标题重复,已经过滤\t", headline)
+            else:
+                kept.append(article)
+                seen_titles.append(headline)
+        deduped_groups.append(kept)
+    first, second, third = deduped_groups
+    return first, second, third
 
 
 class AccountArticleRank(object):
@@ -34,6 +43,7 @@ class AccountArticleRank(object):
         :param params: 请求参数
         :param mysql_client: 数据库链接池
         """
+        self.filter_list = None
         self.publishArticleList = None
         self.publishNum = None
         self.strategy = None
@@ -42,6 +52,23 @@ class AccountArticleRank(object):
         self.accountId = None
         self.params = params
         self.mysql_client = mysql_client
+        self.request_id = "alg-{}-{}".format(uuid4(), int(time.time()))
+        self.logger = AliyunArticleLog(request_id=self.request_id, alg="ArticleRank")
+        self.pipeline = LongArticlesPipeline()
+
+    def filter(self):
+        """
+        Split the requested articles into publishable and filtered ones.
+
+        Each article in params['publishArticleList'] is handed to the
+        pipeline. A truthy verdict stores its reason on the article under
+        'filterReason' and puts the article in self.filter_list; every other
+        article goes to self.publishArticleList.
+        """
+        accepted = self.publishArticleList = []
+        rejected = self.filter_list = []
+        for article in self.params['publishArticleList']:
+            verdict = self.pipeline.deal(article)
+            if not verdict:
+                accepted.append(article)
+                continue
+            article['filterReason'] = verdict['fileterReason']
+            rejected.append(article)
 
     async def check_params(self):
         """
@@ -54,9 +81,12 @@ class AccountArticleRank(object):
             self.ghId = self.params["ghId"]
             self.strategy = self.params["strategy"]
             self.publishNum = self.params["publishNum"]
-            self.publishArticleList = self.params["publishArticleList"]
-            self.title_list = [i["title"] for i in self.publishArticleList]
-            self.content_list = [i["content"] for i in self.publishArticleList]
+            self.filter()
+            self.logger.log(
+                code="1001",
+                msg="参数校验成功",
+                data=self.params
+            )
             return None
         except Exception as e:
             response = {
@@ -64,55 +94,162 @@ class AccountArticleRank(object):
                 "info": "params check failed, params : {} is not correct".format(e),
                 "code": 0,
             }
+            self.logger.log(
+                code="1002",
+                msg="参数校验失败--{}".format(e),
+                data=self.params
+            )
             return response
 
+    def _rank_by_title_score(self, articles, bonus=0):
+        """
+        Score articles by title with ArticleRank and sort them best-first.
+
+        The score (plus bonus, when given) is written to each article's
+        'score' key in place.
+
+        :param articles: list of article dicts carrying a 'title'
+        :param bonus: constant added to every score
+        :return: new list sorted by 'score' descending; [] for empty input
+        """
+        if not articles:
+            return []
+        rank_info = ArticleRank().rank(
+            account_list=[self.accountName],
+            text_list=[article['title'] for article in articles]
+        )
+        score_list = rank_info[self.accountName]['score_list']
+        scored = []
+        # Scores are paired with articles by position; zip stops at the
+        # shorter sequence, so a surplus score cannot cause an IndexError.
+        for article, score in zip(articles, score_list):
+            article['score'] = score + bonus if bonus else score
+            scored.append(article)
+        return sorted(scored, key=lambda x: x['score'], reverse=True)
+
+    async def basic_rank(self):
+        """
+        Split self.publishArticleList into three groups by producePlanName
+        and order each group.
+
+        Group 1 (name contains "【1】"): title score + 1000, best score first.
+        Group 2 (name contains "【2】"): fixed score 100, highest
+        crawlerViewCount first.
+        Group 3 (neither marker): title score, best score first.
+
+        :return: tuple (ranked_1, ranked_2, ranked_3)
+        """
+        # Step 1: partition in a single pass.
+        # NOTE: a plan name holding both markers lands in group 1 and group 2.
+        group_1, group_2, group_3 = [], [], []
+        for article in self.publishArticleList:
+            plan_name = article['producePlanName']
+            in_group_1 = "【1】" in plan_name
+            in_group_2 = "【2】" in plan_name
+            if in_group_1:
+                group_1.append(article)
+            if in_group_2:
+                group_2.append(article)
+            if not (in_group_1 or in_group_2):
+                group_3.append(article)
+
+        # Step 2: groups 1 and 3 are ordered by score, group 2 by view count.
+        ranked_1 = self._rank_by_title_score(group_1, bonus=1000)
+
+        for article in group_2:
+            article['score'] = 100
+        ranked_2 = sorted(group_2, key=lambda x: x['crawlerViewCount'], reverse=True)
+
+        ranked_3 = self._rank_by_title_score(group_3)
+
+        self.logger.log(
+            code="1004",
+            msg="排序完成",
+            data={
+                "rank1": ranked_1,
+                "rank2": ranked_2,
+                "rank3": ranked_3
+            }
+        )
+        return ranked_1, ranked_2, ranked_3
+
     async def rank_v1(self):
-        """
-        Rank Version 1
-        :return:
-        """
-        try:
-            rank_info = ArticleRank().rank(
-                account_list=[self.accountName], text_list=self.title_list
-            )
-            score_list = rank_info[self.accountName]["score_list"]
-
-            title_score_dict = {}
-            for index, item in enumerate(self.title_list):
-                title_score_dict[item] = score_list[index]
-
-            result_list = []
-            for obj in self.publishArticleList:
-                if title_score_dict.get(obj["title"]):
-                    produce_plan_name = obj['producePlanName']
-                    if "【1】" in produce_plan_name:
-                        obj["score"] = title_score_dict[obj["title"]] + 1000
-                    elif "【2】" in produce_plan_name:
-                        obj["score"] = title_score_dict[obj["title"]] + 100
-                    else:
-                        obj["score"] = title_score_dict[obj["title"]]
-                    result_list.append(obj)
-
-            sorted_list = sorted(result_list, key=lambda x: x["score"], reverse=True)
-            result = {
-                "accountId": self.accountId,
-                "accountName": self.accountName,
-                "ghId": self.ghId,
-                "strategy": self.strategy,
-                "publishNum": self.publishNum,
-                "rank_list": sorted_list[: self.publishNum],
-            }
-        except Exception as e:
-            result = {
-                "accountId": self.accountId,
-                "accountName": self.accountName,
-                "ghId": self.ghId,
-                "strategy": self.strategy,
-                "publishNum": self.publishNum,
-                "rank_list": self.publishArticleList[: self.publishNum],
-            }
-        response = {"status": "Rank Success", "data": result, "code": 1}
-        return response
+        """
+        Rank version 1: order the three plan groups, drop similar titles,
+        then assemble the publish list.
+
+        The list opens with the top article of group 1 followed by the top
+        article of group 2; when group 1 is empty it opens with up to two
+        articles of group 2. All of group 3 follows and the result is cut to
+        publishNum.
+
+        :return: response dict with "status", "data" and "code"; or
+                 {"code": 2, "info": "account is not exist"} when ranking,
+                 deduplication or the fallback path raises
+        """
+        try:
+            ranked_1_d, ranked_2_d, ranked_3_d = await self.basic_rank()
+            ranked_1, ranked_2, ranked_3 = deduplication(ranked_1_d, ranked_2_d, ranked_3_d)
+            try:
+                if ranked_1:
+                    head = ranked_1[:1] + ranked_2[:1]
+                else:
+                    head = ranked_2[:2]
+                rank_list = head + list(ranked_3)
+
+                result = {
+                    "accountId": self.accountId,
+                    "accountName": self.accountName,
+                    "ghId": self.ghId,
+                    "strategy": self.strategy,
+                    "publishNum": self.publishNum,
+                    "rank_list": rank_list[:self.publishNum],
+                    "filter_list": self.filter_list
+                }
+                self.logger.log(
+                    code=1006,
+                    msg="rank successfully",
+                    data=result
+                )
+                response = {"status": "Rank Success", "data": result, "code": 1}
+            except Exception as e:
+                # Fallback: return the unranked, already filtered articles.
+                result = {
+                    "accountId": self.accountId,
+                    "accountName": self.accountName,
+                    "ghId": self.ghId,
+                    "strategy": self.strategy,
+                    "publishNum": self.publishNum,
+                    "rank_list": self.publishArticleList[: self.publishNum],
+                    "filter_list": self.filter_list
+                }
+                self.logger.log(
+                    code=1007,
+                    msg="rank failed because of {}".format(e),
+                    data=result
+                )
+                response = {"status": "Rank Fail", "data": result, "code": 1}
+            return response
+        except Exception:
+            # Not a bare "except:": that would also swallow
+            # asyncio.CancelledError and KeyboardInterrupt, which must
+            # propagate out of a coroutine.
+            result = {"code": 2, "info": "account is not exist"}
+            return result
 
     async def rank_v2(self):
         """
@@ -149,14 +286,34 @@ class AccountArticleRank(object):
         """
         match self.strategy:
             case "ArticleRankV1":
+                self.logger.log(
+                    code="1003",
+                    msg="命中排序策略1"
+                )
                 return await self.rank_v1()
             case "ArticleRankV2":
+                self.logger.log(
+                    code="1003",
+                    msg="命中排序策略2"
+                )
                 return await self.rank_v2()
             case "ArticleRankV3":
+                self.logger.log(
+                    code="1003",
+                    msg="命中排序策略3"
+                )
                 return await self.rank_v3()
             case "ArticleRankV4":
+                self.logger.log(
+                    code="1003",
+                    msg="命中排序策略4"
+                )
                 return await self.rank_v4()
             case "ArticleRankV5":
+                self.logger.log(
+                    code="1003",
+                    msg="命中排序策略5"
+                )
                 return await self.rank_v5()
 
     async def deal(self):
@@ -168,11 +325,4 @@ class AccountArticleRank(object):
         if error_params:
             return error_params
         else:
-            try:
-                self.title_list = [
-                    i for i in self.title_list if not has_same(i, self.accountName)
-                ]
-                return await self.choose_strategy()
-            except Exception as e:
-                result = {"code": 2, "info": "account is not exist"}
-                return result
+            return await self.choose_strategy()

+ 2 - 4
routes/__init__.py

@@ -6,7 +6,7 @@ from quart import Blueprint, jsonify, request
 
 from .AccountArticleRank import AccountArticleRank
 
-blueprint = Blueprint('LongArticlesAlgServer', __name__)
+blueprint = Blueprint("LongArticlesAlgServer", __name__)
 
 
 def AlgRoutes(mysql_client):
@@ -21,9 +21,7 @@ def AlgRoutes(mysql_client):
         测试服务连通性
         :return:
         """
-        response = {
-            "msg": "Hello, World! Hello, Future"
-        }
+        response = {"msg": "Hello, World! Hello, Future"}
         return jsonify(response)
 
     @blueprint.route("/articleRank", methods=["POST"])

Diff do ficheiro suprimidas por serem muito extensas
+ 3 - 2
test/rank_dev.py


Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff