罗俊辉 10 месяцев назад
Родитель
Сommit
166cb403e5

+ 17 - 0
applications/articleTools.py

@@ -0,0 +1,17 @@
+"""
+@author: luojunhui
+"""
+
+
+class ArticleDBTools:
+    """
+    Helper for long-form article database features.
+
+    At this point it only keeps the MySQL client it is given; no query
+    methods are defined yet.
+    """
+
+    def __init__(self, mysql_client):
+        """
+        Keep a reference to the MySQL client.
+
+        :param mysql_client: client object supplied by the caller; stored as-is
+        """
+        self.mysql_client = mysql_client
+
+

+ 0 - 1
requirements.txt

@@ -25,6 +25,5 @@ selenium
 torch~=2.3.1
 tqdm~=4.66.4
 transformers
-
 pydantic~=2.6.4
 similarities~=1.1.7

+ 22 - 1
routes/__init__.py

@@ -4,8 +4,9 @@
 
 from quart import Blueprint, jsonify, request
 
-from .AccountArticleRank import AccountArticleRank
+from .accountArticleRank import AccountArticleRank
 from .nlpServer import NLPServer
+from .articleDBServer import ArticleDB
 
 
 def AlgRoutes(mysql_client, model):
@@ -47,4 +48,24 @@ def AlgRoutes(mysql_client, model):
         response = nlpS.deal()
         return jsonify(response)
 
+    @blueprint.route("/score_list", methods=["POST"])
+    async def articleAccount():
+        """
+        Endpoint for official-account article features.
+
+        For now it simply echoes the posted JSON body back to the caller.
+        :return: the request JSON wrapped by jsonify
+        """
+        return jsonify(await request.get_json())
+
+    @blueprint.route("/article_db", methods=["POST"])
+    async def articleMysql():
+        """
+        Endpoint for long-form article database operations.
+
+        :return: the result of ArticleDB.deal() wrapped by jsonify
+        """
+        payload = await request.get_json()
+        article_db = ArticleDB(params=payload, mysql_client=mysql_client)
+        return jsonify(await article_db.deal())
+
     return blueprint

+ 46 - 63
routes/AccountArticleRank.py → routes/accountArticleRank.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+
 import random
 import time
 
@@ -23,7 +24,7 @@ def deduplication(rank1, rank2, rank3):
         result = []
         if item_list:
             for item in item_list:
-                title = item['title']
+                title = item["title"]
                 if title_sim_v2_by_list(title, dup_list):
                     # print("标题重复,已经过滤\t", title)
                     continue
@@ -63,11 +64,13 @@ class AccountArticleRank(object):
         """
         self.publishArticleList = []
         self.filter_list = []
-        history_title_dict = self.pipeline.history_title(account_nickname=self.accountName)
-        for item in tqdm(self.params['publishArticleList']):
+        history_title_dict = self.pipeline.history_title(
+            account_nickname=self.accountName
+        )
+        for item in tqdm(self.params["publishArticleList"]):
             flag = self.pipeline.deal(item, self.accountName, history_title_dict)
             if flag:
-                item['filterReason'] = flag['filterReason']
+                item["filterReason"] = flag["filterReason"]
                 self.filter_list.append(item)
             else:
                 self.publishArticleList.append(item)
@@ -86,11 +89,7 @@ class AccountArticleRank(object):
             self.publishNum = self.params["publishNum"]
             print("开始校验参数")
             self.filter()
-            self.logger.log(
-                code="1001",
-                msg="参数校验成功",
-                data=self.params
-            )
+            self.logger.log(code="1001", msg="参数校验成功", data=self.params)
             return None
         except Exception as e:
             response = {
@@ -99,9 +98,7 @@ class AccountArticleRank(object):
                 "code": 0,
             }
             self.logger.log(
-                code="1002",
-                msg="参数校验失败--{}".format(e),
-                data=self.params
+                code="1002", msg="参数校验失败--{}".format(e), data=self.params
             )
             return response
 
@@ -111,10 +108,17 @@ class AccountArticleRank(object):
         :return:
         """
         # 第一步把所有文章标题分为3组
-        article_list1_ori = [i for i in self.publishArticleList if "【1】" in i['producePlanName']]
-        article_list2_ori = [i for i in self.publishArticleList if "【2】" in i['producePlanName']]
-        article_list3_ori = [i for i in self.publishArticleList if
-                             not i in article_list1_ori and not i in article_list2_ori]
+        article_list1_ori = [
+            i for i in self.publishArticleList if "【1】" in i["producePlanName"]
+        ]
+        article_list2_ori = [
+            i for i in self.publishArticleList if "【2】" in i["producePlanName"]
+        ]
+        article_list3_ori = [
+            i
+            for i in self.publishArticleList
+            if not i in article_list1_ori and not i in article_list2_ori
+        ]
 
         # # 全局去重,保留优先级由  L1 --> L2 --> L3
         # hash_map = {}
@@ -150,22 +154,24 @@ class AccountArticleRank(object):
         if article_list1_ori:
             rank1 = ArticleRank().rank(
                 account_list=[self.accountName],
-                text_list=[i['title'] for i in article_list1_ori]
+                text_list=[i["title"] for i in article_list1_ori],
             )
-            score_list1 = rank1[self.accountName]['score_list']
+            score_list1 = rank1[self.accountName]["score_list"]
             ranked_1 = []
             for index, value in enumerate(score_list1):
                 obj = article_list1_ori[index]
-                obj['score'] = value + 1000
+                obj["score"] = value + 1000
                 ranked_1.append(obj)
-            ranked_1 = sorted(ranked_1, key=lambda x: x['score'], reverse=True)
+            ranked_1 = sorted(ranked_1, key=lambda x: x["score"], reverse=True)
         else:
             ranked_1 = []
         # rank2
         if article_list2_ori:
             for item in article_list2_ori:
-                item['score'] = 100
-            ranked_2 = sorted(article_list2_ori, key=lambda x: x['crawlerViewCount'], reverse=True)
+                item["score"] = 100
+            ranked_2 = sorted(
+                article_list2_ori, key=lambda x: x["crawlerViewCount"], reverse=True
+            )
         else:
             ranked_2 = []
 
@@ -173,25 +179,21 @@ class AccountArticleRank(object):
         if article_list3_ori:
             rank3 = ArticleRank().rank(
                 account_list=[self.accountName],
-                text_list=[i['title'] for i in article_list3_ori]
+                text_list=[i["title"] for i in article_list3_ori],
             )
-            score_list3 = rank3[self.accountName]['score_list']
+            score_list3 = rank3[self.accountName]["score_list"]
             ranked_3 = []
             for index, value in enumerate(score_list3):
                 obj = article_list3_ori[index]
-                obj['score'] = value
+                obj["score"] = value
                 ranked_3.append(obj)
-            ranked_3 = sorted(ranked_3, key=lambda x: x['score'], reverse=True)
+            ranked_3 = sorted(ranked_3, key=lambda x: x["score"], reverse=True)
         else:
             ranked_3 = []
         self.logger.log(
             code="1004",
             msg="排序完成",
-            data={
-                "rank1": ranked_1,
-                "rank2": ranked_2,
-                "rank3": ranked_3
-            }
+            data={"rank1": ranked_1, "rank2": ranked_2, "rank3": ranked_3},
         )
         return ranked_1, ranked_2, ranked_3
 
@@ -203,7 +205,9 @@ class AccountArticleRank(object):
         print("开始排序")
         try:
             ranked_1_d, ranked_2_d, ranked_3_d = await self.basic_rank()
-            ranked_1, ranked_2, ranked_3 = deduplication(ranked_1_d, ranked_2_d, ranked_3_d)
+            ranked_1, ranked_2, ranked_3 = deduplication(
+                ranked_1_d, ranked_2_d, ranked_3_d
+            )
             print("去重成功")
             try:
                 L = []
@@ -229,14 +233,10 @@ class AccountArticleRank(object):
                     "ghId": self.ghId,
                     "strategy": self.strategy,
                     "publishNum": self.publishNum,
-                    "rank_list": L[:self.publishNum],
-                    "filter_list": self.filter_list
+                    "rank_list": L[: self.publishNum],
+                    "filter_list": self.filter_list,
                 }
-                self.logger.log(
-                    code=1006,
-                    msg="rank successfully",
-                    data=result
-                )
+                self.logger.log(code=1006, msg="rank successfully", data=result)
                 response = {"status": "Rank Success", "data": result, "code": 1}
             except Exception as e:
                 result = {
@@ -246,12 +246,10 @@ class AccountArticleRank(object):
                     "strategy": self.strategy,
                     "publishNum": self.publishNum,
                     "rank_list": self.publishArticleList[: self.publishNum],
-                    "filter_list": self.filter_list
+                    "filter_list": self.filter_list,
                 }
                 self.logger.log(
-                    code=1007,
-                    msg="rank failed because of {}".format(e),
-                    data=result
+                    code=1007, msg="rank failed because of {}".format(e), data=result
                 )
                 print("排序成功")
                 response = {"status": "Rank Fail", "data": result, "code": 1}
@@ -295,34 +293,19 @@ class AccountArticleRank(object):
         """
         match self.strategy:
             case "ArticleRankV1":
-                self.logger.log(
-                    code="1003",
-                    msg="命中排序策略1"
-                )
+                self.logger.log(code="1003", msg="命中排序策略1")
                 return await self.rank_v1()
             case "ArticleRankV2":
-                self.logger.log(
-                    code="1003",
-                    msg="命中排序策略2"
-                )
+                self.logger.log(code="1003", msg="命中排序策略2")
                 return await self.rank_v2()
             case "ArticleRankV3":
-                self.logger.log(
-                    code="1003",
-                    msg="命中排序策略3"
-                )
+                self.logger.log(code="1003", msg="命中排序策略3")
                 return await self.rank_v3()
             case "ArticleRankV4":
-                self.logger.log(
-                    code="1003",
-                    msg="命中排序策略4"
-                )
+                self.logger.log(code="1003", msg="命中排序策略4")
                 return await self.rank_v4()
             case "ArticleRankV5":
-                self.logger.log(
-                    code="1003",
-                    msg="命中排序策略5"
-                )
+                self.logger.log(code="1003", msg="命中排序策略5")
                 return await self.rank_v5()
 
     async def deal(self):

+ 114 - 0
routes/accountServer.py

@@ -0,0 +1,114 @@
+"""
+@author: luojunhui
+"""
+
+
+def get_account_interest_by_top(account_nickname, min_time, max_time, rate):
+    """
+    Placeholder: ignores every argument and returns a fixed pair.
+
+    :return: the tuple (1, 2); AccountServer.getEachAccountScoreList unpacks
+        it as (account_interest, account_weight)
+    """
+    interest, weight = 1, 2
+    return interest, weight
+
+
+def get_account_interest_by_avg(account_nickname, min_time, max_time, rate):
+    """
+    Placeholder: ignores every argument and returns a fixed pair.
+
+    :return: the tuple (1, 2); AccountServer.getEachAccountScoreList unpacks
+        it as (account_interest, account_weight)
+    """
+    interest, weight = 1, 2
+    return interest, weight
+
+
+def get_sim_score_cross_mean(a, b, c):
+    """
+    Placeholder: ignores every argument and returns a fixed dict.
+
+    NOTE(review): the returned dict has neither "score_list_mean" nor
+    "text_list_max", the keys AccountServer.getEachAccountScoreList reads,
+    so that caller always ends up in its fallback branch. Confirm this is
+    a temporary stub.
+
+    :return: {"1": "2"}
+    """
+    placeholder = {"1": "2"}
+    return placeholder
+
+
+def get_sim_score_cross_avg(a, b, c):
+    """
+    Placeholder: ignores every argument and returns a fixed dict.
+
+    NOTE(review): the returned dict has neither "score_list_avg" nor
+    "text_list_max", the keys AccountServer.getEachAccountScoreList reads,
+    so that caller always ends up in its fallback branch. Confirm this is
+    a temporary stub.
+
+    :return: {"1": "2"}
+    """
+    placeholder = {"1": "2"}
+    return placeholder
+
+
+class AccountServer(object):
+    """
+    Score the relevance between a list of titles and official-account articles.
+
+    Typical use is ``AccountServer(params).deal()``: the parameters are
+    validated once and then one score entry is produced per account name.
+    """
+
+    def __init__(self, params):
+        """
+        Store the raw request payload; nothing is parsed until checkParams().
+
+        :param params: request payload, read with ``[...]`` and ``.get(...)``
+        """
+        self.account_name_list = None
+        self.sim_type = None
+        self.interest_type = None
+        self.min_time = None
+        self.max_time = None
+        self.rate = None
+        self.title_list = None
+        self.params = params
+
+    def checkParams(self):
+        """
+        Validate the request parameters and copy them onto the instance.
+
+        "text_list" is required; every other key falls back to a default.
+
+        :return: None on success, otherwise a dict with "error" and "detail"
+        """
+        try:
+            self.title_list = self.params["text_list"]
+            # An explicit null in the payload is treated like a missing key so
+            # the later loop over account names never iterates over None.
+            self.account_name_list = self.params.get("account_nickname_list") or []
+            self.rate = self.params.get("rate", 0.1)
+            # NOTE(review): max_time / min_time reuse the 0.1 default of
+            # "rate", which looks like a placeholder -- confirm intended values.
+            self.max_time = self.params.get("max_time", 0.1)
+            self.min_time = self.params.get("min_time", 0.1)
+            self.interest_type = self.params.get("interest_type", "by_top")
+            self.sim_type = self.params.get("sim_type", "mean")
+        except Exception as e:
+            return {
+                "error": "Params error",
+                "detail": str(e),
+            }
+        return None
+
+    def getEachAccountScoreList(self, account_name):
+        """
+        Compute the relevance scores of the titles against a single account.
+
+        :param account_name: nickname of the account to score against
+        :return: dict with "score_list" and "text_list_max"; if anything
+            raises, a fallback holding a zero score per title and the titles
+            themselves
+        """
+        try:
+            # Pick the interest function first so the arguments are spelled once.
+            get_interest = (
+                get_account_interest_by_top
+                if self.interest_type == "by_top"
+                else get_account_interest_by_avg
+            )
+            account_interest, account_weight = get_interest(
+                account_nickname=account_name,
+                min_time=self.min_time,
+                max_time=self.max_time,
+                rate=self.rate,
+            )
+            if self.sim_type == "mean":
+                get_sim_score, sim_key = get_sim_score_cross_mean, "score_list_mean"
+            else:
+                get_sim_score, sim_key = get_sim_score_cross_avg, "score_list_avg"
+            res = get_sim_score(self.title_list, account_interest, account_weight)
+            return {
+                "score_list": res[sim_key],
+                "text_list_max": res["text_list_max"],
+            }
+        except Exception as e:
+            # Any failure degrades to a zero score per title instead of
+            # propagating to the caller.
+            print(e)
+            return {
+                "score_list": [0] * len(self.title_list),
+                "text_list_max": self.title_list,
+            }
+
+    def getAccountListScoreList(self):
+        """
+        Compute the relevance scores for every account in account_name_list.
+
+        :return: dict mapping each distinct account name to its score entry
+        """
+        response = {}
+        for accountName in self.account_name_list:
+            # A name listed more than once is only scored the first time.
+            if accountName not in response:
+                response[accountName] = self.getEachAccountScoreList(account_name=accountName)
+        return response
+
+    def deal(self):
+        """
+        Entry point: validate the parameters once, then score.
+
+        :return: the error dict from checkParams() when validation fails,
+            otherwise the mapping built by getAccountListScoreList()
+        """
+        error_response = self.checkParams()
+        if error_response:
+            return error_response
+        return self.getAccountListScoreList()

+ 35 - 0
routes/articleDBServer.py

@@ -0,0 +1,35 @@
+"""
+@author: luojunhui
+"""
+from applications.articleTools import ArticleDBTools
+
+
+class ArticleDB(object):
+    """
+    Request handler for long-form article database features.
+
+    The endpoint is still a skeleton: validation and scheduling are empty
+    and deal() returns a fixed placeholder message.
+    """
+
+    def __init__(self, params, mysql_client):
+        """
+        :param params: request payload; stored as-is
+        :param mysql_client: MySQL client; stored and also handed to ArticleDBTools
+        """
+        self.params = params
+        self.mysql_client = mysql_client
+        self.tools = ArticleDBTools(self.mysql_client)
+
+    def checkParams(self):
+        """
+        Validate the request parameters.
+
+        Not implemented yet: performs no checks.
+        :return: None
+        """
+        return None
+
+    def task_schedule(self):
+        """
+        Dispatch the requested task.
+
+        Not implemented yet.
+        :return: None
+        """
+
+    async def deal(self):
+        """
+        Entry point of the handler.
+
+        :return: placeholder dict whose message says the endpoint is still
+            under development
+        """
+        return {"message": "此接口正在开发中"}
+

+ 1 - 4
routes/nlpServer.py

@@ -56,10 +56,7 @@ class NLPServer(object):
         deal function
         :return:
         """
-        if self.check_params():
-            return self.check_params()
-        else:
-            return self.schedule_function()
+        return self.check_params() if self.check_params() else self.schedule_function()
 
 
 

+ 2 - 3
test/nlp_dev.py

@@ -5,7 +5,7 @@ import json
 import requests
 import time
 
-url = "http://localhost:6060/nlp"
+url = "http://47.98.154.124:6060/nlp"
 
 list_data = {
     "text_list_a": ["凯旋", "毛泽东", "周恩来"],
@@ -20,7 +20,6 @@ body1 = {
     "function": "similarities"
 }
 
-
 body2 = {
     "data": list_data,
     "function": "similarities_cross"
@@ -49,7 +48,7 @@ body5 = {
 headers = {"Content-Type": "application/json"}
 
 a = time.time()
-response = requests.post(url=url, headers=headers, json=body4)
+response = requests.post(url=url, headers=headers, json=body5)
 b = time.time()
 print(json.dumps(response.json(), ensure_ascii=False, indent=4))
 print(b - a)

+ 1 - 0
test/note.txt

@@ -0,0 +1 @@
+scp -i ~/.ssh/ali-denet.pem  -r ~/.cache/huggingface/hub root@47.98.154.124:~/.cache/huggingface/hub

+ 4 - 0
test/test.json

@@ -0,0 +1,4 @@
+{
+    "1": "1",
+    "2": "2"
+}

+ 23 - 0
test/test.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+import json
+
+
+# Scratch script: write a small dict to test.json, read it back, print it.
+path = "test.json"
+
+
+obj = {
+    "1": "1",
+    "2": "2",
+}
+
+# Serialise straight into the file handle, then terminate with a newline.
+with open(path, "w", encoding="utf-8") as f:
+    json.dump(obj, f, ensure_ascii=False, indent=4)
+    f.write("\n")
+
+
+# Parse the file back from the handle.
+with open(path, encoding="utf-8") as f:
+    data = json.load(f)
+
+print(data)
+print(type(data))