|
@@ -1,27 +1,36 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
+import json
|
|
|
+import time
|
|
|
|
|
|
-from applications.functions import ArticleRank
|
|
|
-from applications.functions import title_sim_v2_by_list
|
|
|
-from applications.functions import get_article_title_url_list
|
|
|
+from uuid import uuid4
|
|
|
|
|
|
+from applications.aliyunLog import AliyunArticleLog
|
|
|
+from applications.functions import ArticleRank, title_sim_v2_by_list
|
|
|
+from applications.pipeline import LongArticlesPipeline
|
|
|
|
|
|
-def has_same(title, account_nickname):
|
|
|
+
|
|
|
def deduplication(rank1, rank2, rank3):
    """
    De-duplicate articles by title similarity across three ranked groups.

    The groups are walked in the order rank1, rank2, rank3, so an article in
    an earlier group wins over a similar one in a later group. A title is
    treated as a duplicate when ``title_sim_v2_by_list`` returns a truthy
    value for it against the titles kept so far.

    :param rank1: first-priority list of article dicts (may be empty or None)
    :param rank2: second-priority list of article dicts (may be empty or None)
    :param rank3: third-priority list of article dicts (may be empty or None)
    :return: tuple of three lists holding the articles kept from each group,
             in their original relative order
    """
    seen_titles = []
    deduped_groups = []
    for group in (rank1, rank2, rank3):
        kept = []
        # A falsy group (None / empty) simply contributes an empty result.
        for article in group or []:
            headline = article['title']
            if not title_sim_v2_by_list(headline, seen_titles):
                kept.append(article)
                seen_titles.append(headline)
                continue
            print("标题重复,已经过滤\t", headline)
        deduped_groups.append(kept)
    first, second, third = deduped_groups
    return first, second, third
|
|
|
|
|
|
|
|
|
class AccountArticleRank(object):
|
|
@@ -34,6 +43,7 @@ class AccountArticleRank(object):
|
|
|
:param params: 请求参数
|
|
|
:param mysql_client: 数据库链接池
|
|
|
"""
|
|
|
+ self.filter_list = None
|
|
|
self.publishArticleList = None
|
|
|
self.publishNum = None
|
|
|
self.strategy = None
|
|
@@ -42,6 +52,23 @@ class AccountArticleRank(object):
|
|
|
self.accountId = None
|
|
|
self.params = params
|
|
|
self.mysql_client = mysql_client
|
|
|
+ self.request_id = "alg-{}-{}".format(uuid4(), int(time.time()))
|
|
|
+ self.logger = AliyunArticleLog(request_id=self.request_id, alg="ArticleRank")
|
|
|
+ self.pipeline = LongArticlesPipeline()
|
|
|
+
|
|
|
def filter(self):
    """
    Split the requested articles into publishable and filtered-out ones.

    Each item of ``self.params['publishArticleList']`` is passed to
    ``self.pipeline.deal``. A falsy result keeps the item in
    ``self.publishArticleList``; a truthy result moves it to
    ``self.filter_list`` with the reason copied onto ``item['filterReason']``.
    """
    # Bind the attributes first so they exist even if the loop raises.
    accepted = self.publishArticleList = []
    rejected = self.filter_list = []
    for article in self.params['publishArticleList']:
        verdict = self.pipeline.deal(article)
        if not verdict:
            accepted.append(article)
            continue
        # NOTE(review): the key 'fileterReason' looks misspelled, but it is
        # read as-is from the pipeline result -- confirm against the pipeline.
        article['filterReason'] = verdict['fileterReason']
        rejected.append(article)
|
|
|
|
|
|
async def check_params(self):
|
|
|
"""
|
|
@@ -54,9 +81,12 @@ class AccountArticleRank(object):
|
|
|
self.ghId = self.params["ghId"]
|
|
|
self.strategy = self.params["strategy"]
|
|
|
self.publishNum = self.params["publishNum"]
|
|
|
- self.publishArticleList = self.params["publishArticleList"]
|
|
|
- self.title_list = [i["title"] for i in self.publishArticleList]
|
|
|
- self.content_list = [i["content"] for i in self.publishArticleList]
|
|
|
+ self.filter()
|
|
|
+ self.logger.log(
|
|
|
+ code="1001",
|
|
|
+ msg="参数校验成功",
|
|
|
+ data=self.params
|
|
|
+ )
|
|
|
return None
|
|
|
except Exception as e:
|
|
|
response = {
|
|
@@ -64,55 +94,162 @@ class AccountArticleRank(object):
|
|
|
"info": "params check failed, params : {} is not correct".format(e),
|
|
|
"code": 0,
|
|
|
}
|
|
|
+ self.logger.log(
|
|
|
+ code="1002",
|
|
|
+ msg="参数校验失败--{}".format(e),
|
|
|
+ data=self.params
|
|
|
+ )
|
|
|
return response
|
|
|
|
|
|
async def basic_rank(self):
    """
    Split the publishable articles into three groups and order each group.

    Grouping is decided by ``producePlanName``:
      * group 1 -- the name contains "【1】"
      * group 2 -- the name contains "【2】"
      * group 3 -- everything else

    Each article is placed in exactly one group; when a name contains both
    markers, group 1 wins. This keeps one shared article dict from being
    scored twice and ending up in two result lists.

    Groups 1 and 3 are ordered by the score list that ``ArticleRank().rank``
    returns for this account (group 1 gets a +1000 bonus). Group 2 gets a
    fixed score of 100 and is ordered by ``crawlerViewCount``. All three
    lists are sorted in descending order, and the ``score`` key is written
    onto the article dicts in place.

    :return: tuple ``(ranked_1, ranked_2, ranked_3)``
    """

    def rank_by_title_score(articles, bonus):
        """Score ``articles`` by title, add ``bonus``, return them best-first."""
        if not articles:
            return []
        rank_info = ArticleRank().rank(
            account_list=[self.accountName],
            text_list=[article['title'] for article in articles]
        )
        score_list = rank_info[self.accountName]['score_list']
        scored = []
        # zip pairs scores with articles positionally and cannot run past
        # the end of either sequence.
        for article, score in zip(articles, score_list):
            article['score'] = score + bonus
            scored.append(article)
        return sorted(scored, key=lambda x: x['score'], reverse=True)

    # Step 1: partition the articles into the three groups in one pass.
    group_1, group_2, group_3 = [], [], []
    for article in self.publishArticleList:
        plan_name = article['producePlanName']
        if "【1】" in plan_name:
            group_1.append(article)
        elif "【2】" in plan_name:
            group_2.append(article)
        else:
            group_3.append(article)

    # Step 2: groups 1 and 3 are ordered by score, group 2 by view count.
    ranked_1 = rank_by_title_score(group_1, 1000)

    for article in group_2:
        article['score'] = 100
    ranked_2 = sorted(group_2, key=lambda x: x['crawlerViewCount'], reverse=True)

    ranked_3 = rank_by_title_score(group_3, 0)

    self.logger.log(
        code="1004",
        msg="排序完成",
        data={
            "rank1": ranked_1,
            "rank2": ranked_2,
            "rank3": ranked_3
        }
    )
    return ranked_1, ranked_2, ranked_3
|
|
|
+
|
|
|
async def rank_v1(self):
    """
    Rank Version 1

    Builds the publish list from the three de-duplicated groups produced by
    ``basic_rank``:
      * if group 1 is non-empty: its top article, then the top of group 2
      * otherwise: up to the top two articles of group 2
      * followed by every article of group 3
    The list is then cut to ``self.publishNum`` entries.

    :return: a response dict. On success ``status`` is "Rank Success". If
             assembling the list fails, ``status`` is "Rank Fail" and the
             unranked ``publishArticleList`` is used as a fallback. If
             ranking or de-duplication itself raises, the dict is
             ``{"code": 2, "info": "account is not exist"}``.
    """
    try:
        ranked_1_d, ranked_2_d, ranked_3_d = await self.basic_rank()
        ranked_1, ranked_2, ranked_3 = deduplication(ranked_1_d, ranked_2_d, ranked_3_d)
        try:
            # Slicing copes with short or empty groups without length checks.
            if ranked_1:
                head = ranked_1[:1] + ranked_2[:1]
            else:
                head = ranked_2[:2]
            rank_list = [*head, *ranked_3]

            result = {
                "accountId": self.accountId,
                "accountName": self.accountName,
                "ghId": self.ghId,
                "strategy": self.strategy,
                "publishNum": self.publishNum,
                "rank_list": rank_list[:self.publishNum],
                "filter_list": self.filter_list
            }
            self.logger.log(
                code=1006,
                msg="rank successfully",
                data=result
            )
            response = {"status": "Rank Success", "data": result, "code": 1}
        except Exception as e:
            # Fall back to the unranked list so the caller still gets articles.
            result = {
                "accountId": self.accountId,
                "accountName": self.accountName,
                "ghId": self.ghId,
                "strategy": self.strategy,
                "publishNum": self.publishNum,
                "rank_list": self.publishArticleList[: self.publishNum],
                "filter_list": self.filter_list
            }
            self.logger.log(
                code=1007,
                msg="rank failed because of {}".format(e),
                data=result
            )
            response = {"status": "Rank Fail", "data": result, "code": 1}
        return response
    except Exception as e:
        # Catch Exception rather than everything: task cancellation and
        # KeyboardInterrupt must propagate instead of being reported as a
        # missing account. The cause is logged so it is not lost.
        self.logger.log(
            code=1007,
            msg="rank failed because of {}".format(e)
        )
        result = {"code": 2, "info": "account is not exist"}
        return result
|
|
|
|
|
|
async def rank_v2(self):
|
|
|
"""
|
|
@@ -149,14 +286,34 @@ class AccountArticleRank(object):
|
|
|
"""
|
|
|
match self.strategy:
|
|
|
case "ArticleRankV1":
|
|
|
+ self.logger.log(
|
|
|
+ code="1003",
|
|
|
+ msg="命中排序策略1"
|
|
|
+ )
|
|
|
return await self.rank_v1()
|
|
|
case "ArticleRankV2":
|
|
|
+ self.logger.log(
|
|
|
+ code="1003",
|
|
|
+ msg="命中排序策略2"
|
|
|
+ )
|
|
|
return await self.rank_v2()
|
|
|
case "ArticleRankV3":
|
|
|
+ self.logger.log(
|
|
|
+ code="1003",
|
|
|
+ msg="命中排序策略3"
|
|
|
+ )
|
|
|
return await self.rank_v3()
|
|
|
case "ArticleRankV4":
|
|
|
+ self.logger.log(
|
|
|
+ code="1003",
|
|
|
+ msg="命中排序策略4"
|
|
|
+ )
|
|
|
return await self.rank_v4()
|
|
|
case "ArticleRankV5":
|
|
|
+ self.logger.log(
|
|
|
+ code="1003",
|
|
|
+ msg="命中排序策略5"
|
|
|
+ )
|
|
|
return await self.rank_v5()
|
|
|
|
|
|
async def deal(self):
|
|
@@ -168,11 +325,4 @@ class AccountArticleRank(object):
|
|
|
if error_params:
|
|
|
return error_params
|
|
|
else:
|
|
|
- try:
|
|
|
- self.title_list = [
|
|
|
- i for i in self.title_list if not has_same(i, self.accountName)
|
|
|
- ]
|
|
|
- return await self.choose_strategy()
|
|
|
- except Exception as e:
|
|
|
- result = {"code": 2, "info": "account is not exist"}
|
|
|
- return result
|
|
|
+ return await self.choose_strategy()
|