罗俊辉 пре 10 месеци
родитељ
комит
e2ead24c33
5 измењених фајлова, 77 додавања и 23 уклањања
  1. 1 1
      applications/aliyunLog.py
  2. 45 14
      applications/pipeline.py
  3. 11 4
      routes/AccountArticleRank.py
  4. 12 1
      routes/__init__.py
  5. 8 3
      test/rank_dev.py

+ 1 - 1
applications/aliyunLog.py

@@ -58,7 +58,7 @@ class AliyunArticleLog(object):
             client.put_logs(request)
         except Exception as e:
             print("日志失败")
-            print(e)
+            # print(e)
 
 
 

+ 45 - 14
applications/pipeline.py

@@ -1,10 +1,12 @@
 """
 @author: luojunhui
 """
+import time
+
 import requests
 
 from applications.functions import title_sim_v2_by_list
-from applications.functions import get_article_title_url_list, get_article_titles
+from applications.functions import get_article_titles
 
 
 class LongArticlesPipeline(object):
@@ -13,23 +15,38 @@ class LongArticlesPipeline(object):
     """
 
     @classmethod
-    def history_exists(cls, title, account_nickname, plan_name):
+    def history_title(cls, account_nickname):
         """
         判断是否存储
-        :param plan_name:
-        :param title:
         :param account_nickname:
         :return:
         """
-        if "【1】" in plan_name or "【2】" in plan_name:
-            index_list = [1, 2]
-        else:
-            index_list = [1, 2, 3, 4, 5, 6, 7, 8]
-        account_title_list = get_article_titles(
+        # if "【1】" in plan_name or "【2】" in plan_name:
+        index_list_1 = [1, 2]
+        index_list_2 = [1, 2, 3, 4, 5, 6, 7, 8]
+        account_title_list_1 = get_article_titles(
             account_nickname,
-            index_list=index_list
+            index_list=index_list_1
         )
-        print(account_title_list)
+        account_title_list_2 = get_article_titles(
+            account_nickname,
+            index_list=index_list_2
+        )
+        res = {
+            "rule_1": account_title_list_1,
+            "rule_2": account_title_list_2
+
+        }
+        return res
+
+    @classmethod
+    def history_exists(cls, title, account_title_list):
+        """
+        判断文章是否历史已发布
+        :param title:
+        :param account_title_list:
+        :return:
+        """
         sim_res = title_sim_v2_by_list(title, account_title_list)
         if sim_res:
             return True
@@ -77,16 +94,20 @@ class LongArticlesPipeline(object):
         return response.json()['is_bad']
 
     @classmethod
-    def deal(cls, article_obj, account_name):
+    def deal(cls, article_obj, account_name, history_title_dict):
         """
+        :param history_title_dict:
         :param account_name:
         :param article_obj:
         :return:
         """
+        a = time.time()
         article_bad_flag = cls.article_bad(
             title=article_obj['title'],
             account_nickname=account_name
         )
+        b = time.time()
+        print("历史低质量文章:", b - a)
         if article_bad_flag:
             response = {
                 "filterReason": "历史表现差的文章",
@@ -94,11 +115,18 @@ class LongArticlesPipeline(object):
             }
             return response
         else:
+            c = time.time()
+            plan_name = article_obj['producePlanName']
+            if "【1】" in plan_name or "【2】" in plan_name:
+                history_title_list = history_title_dict['rule_1']
+            else:
+                history_title_list = history_title_dict['rule_2']
             history_exists_flag = cls.history_exists(
                 title=article_obj['title'],
-                account_nickname=account_name,
-                plan_name=article_obj['producePlanName']
+                account_title_list=history_title_list
             )
+            d = time.time()
+            print("历史已经发布文章:", d - c)
             if history_exists_flag:
                 response = {
                     "filterReason": "历史已发布文章",
@@ -106,7 +134,10 @@ class LongArticlesPipeline(object):
                 }
                 return response
             else:
+                e = time.time()
                 safe_flag = cls.article_safe(title=article_obj['title'])
+                f = time.time()
+                print("安全:", f - e)
                 if safe_flag:
                     response = {
                         "filterReason": "安全违规",

+ 11 - 4
routes/AccountArticleRank.py

@@ -1,10 +1,10 @@
 """
 @author: luojunhui
 """
-import json
 import time
 
 from uuid import uuid4
+from tqdm import tqdm
 
 from applications.aliyunLog import AliyunArticleLog
 from applications.functions import ArticleRank, title_sim_v2_by_list
@@ -24,7 +24,7 @@ def deduplication(rank1, rank2, rank3):
             for item in item_list:
                 title = item['title']
                 if title_sim_v2_by_list(title, dup_list):
-                    print("标题重复,已经过滤\t", title)
+                    # print("标题重复,已经过滤\t", title)
                     continue
                 else:
                     result.append(item)
@@ -62,13 +62,15 @@ class AccountArticleRank(object):
         """
         self.publishArticleList = []
         self.filter_list = []
-        for item in self.params['publishArticleList']:
-            flag = self.pipeline.deal(item, self.accountName)
+        history_title_dict = self.pipeline.history_title(account_nickname=self.accountName)
+        for item in tqdm(self.params['publishArticleList']):
+            flag = self.pipeline.deal(item, self.accountName, history_title_dict)
             if flag:
                 item['filterReason'] = flag['filterReason']
                 self.filter_list.append(item)
             else:
                 self.publishArticleList.append(item)
+        print("过滤完成")
 
     async def check_params(self):
         """
@@ -81,6 +83,7 @@ class AccountArticleRank(object):
             self.ghId = self.params["ghId"]
             self.strategy = self.params["strategy"]
             self.publishNum = self.params["publishNum"]
+            print("开始校验参数")
             self.filter()
             self.logger.log(
                 code="1001",
@@ -196,9 +199,11 @@ class AccountArticleRank(object):
         Rank Version 1
         :return:
         """
+        print("开始排序")
         try:
             ranked_1_d, ranked_2_d, ranked_3_d = await self.basic_rank()
             ranked_1, ranked_2, ranked_3 = deduplication(ranked_1_d, ranked_2_d, ranked_3_d)
+            print("去重成功")
             try:
                 L = []
                 if ranked_1:
@@ -245,6 +250,7 @@ class AccountArticleRank(object):
                     msg="rank failed because of {}".format(e),
                     data=result
                 )
+                print("排序成功")
                 response = {"status": "Rank Fail", "data": result, "code": 1}
             return response
         except:
@@ -325,4 +331,5 @@ class AccountArticleRank(object):
         if error_params:
             return error_params
         else:
+            print("参数校验成功")
             return await self.choose_strategy()

+ 12 - 1
routes/__init__.py

@@ -33,7 +33,18 @@ def AlgRoutes(mysql_client):
         params = await request.get_json()
         AAR = AccountArticleRank(params, mysql_client=mysql_client)
         response = await AAR.deal()
-        print(response)
+        # print(response)
+        return jsonify(response)
+
+    @blueprint.route("/nlp", methods=["POST"])
+    async def nlper():
+        """
+        nlper ma
+        :return:
+        """
+        response = {
+            "msg": "this function is developing"
+        }
         return jsonify(response)
 
     return blueprint

Разлика у датотеци није приказана због своје велике величине
+ 8 - 3
test/rank_dev.py


Неке датотеке нису приказане због велике количине промена