浏览代码

nlp取消jcd兜底
视频时长控制在300s

luojunhui 6 月之前
父节点
当前提交
3712ab6805

+ 7 - 1
applications/config/__init__.py

@@ -37,4 +37,10 @@ moon_shot = {
     "api_key": "sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q",
     "model": "moonshot-v1-32k",
     "base_url": "https://api.moonshot.cn/v1"
-}
+}
+
+# nlp_server_local_machine_url
+nlp_base_url = 'http://61.48.133.26:6060/nlp'
+
+# nlp_server_aliyun_machine_url
+nlp_aliyun_url = 'http://47.98.136.48:6060/nlp'

+ 17 - 3
applications/match_algorithm/rank.py

@@ -3,9 +3,12 @@
 """
 from typing import Dict
 
+from applications.config import nlp_base_url, nlp_aliyun_url
 from applications.match_algorithm.title_similarity import jcd_title_similarity
 from applications.match_algorithm.title_similarity import nlp_title_similarity
 
+empty_list = []
+
 
 def jac_score(d1, d2):
     """
@@ -62,9 +65,19 @@ async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
     """
     title_list = [i['title'] for i in recall_list]
     score_list = await nlp_title_similarity(
+        url=nlp_base_url,
         ori_title=content_title,
         search_title_list=title_list
     )
+    # if the local machine is down, fall back to the Aliyun machine
+    if not score_list:
+        score_list = await nlp_title_similarity(
+            url=nlp_aliyun_url,
+            ori_title=content_title,
+            search_title_list=title_list
+        )
+
+    # check whether score_list exists
     if score_list:
         sorted_list = sorted(
             (
@@ -79,12 +92,13 @@ async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
             "result": sorted_list
         }
     else:
-        # if nlp server is down, use jcd similarity instead
         response = {
-            "alg": "jcd",
-            "result": title_similarity_rank(content_title, recall_list)
+            "result": empty_list
         }
+
     return response
 
 
 
+
+

+ 2 - 4
applications/match_algorithm/title_similarity.py

@@ -20,7 +20,7 @@ def jcd_title_similarity(ori_title, search_title):
     return intersection / union
 
 
-async def nlp_title_similarity(ori_title, search_title_list):
+async def nlp_title_similarity(url, ori_title, search_title_list):
     """
     nlp title similarity
     """
@@ -33,12 +33,10 @@ async def nlp_title_similarity(ori_title, search_title_list):
         "function": "similarities_cross",
         "use_cache": False
     }
-    url = 'http://61.48.133.26:6060/nlp'
-
     async with aiohttp.ClientSession() as session:
         async with session.post(url, headers=headers, json=body) as response:
             response_text = await response.text()
-            if response_text:
+            if response_text and response.status == 200:
                 res = await response.json()
                 score_list = res['score_list_list'][0]
                 return score_list

+ 1 - 1
applications/search/hksp_search.py

@@ -117,7 +117,7 @@ def hksp_search(key, sensitive_words, trace_id):
                 video_id = data['vid']
                 title = data['title']
                 duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
-                if sensitive_flag(sensitive_words, title) and int(duration) <= 900:
+                if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
                     res = get_video_detail(video_id)
                     L.append(res)
                 else:

+ 17 - 9
applications/spider/__init__.py

@@ -3,6 +3,7 @@
 """
 from datetime import datetime
 
+from applications.feishu import bot
 from applications.const import server_const
 from applications.functions.video_item import VideoProducer
 from applications.log import logging
@@ -11,7 +12,8 @@ from .spiderAB import SearchABTest
 from .spiderSchedule import SearchMethod
 
 
-async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client, similarity_score):
+async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
+                              similarity_score):
     """
     异步处理微信 video_obj
     公众号和站内账号一一对应
@@ -103,20 +105,26 @@ async def search_videos_from_web(info, gh_id_map, db_client):
         trace_id=info['trace_id']
     )
     # 按照标题相似度排序
-    ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1], recall_list=recall_list)
-    rank_alg = ranked_result['alg']
+    ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1],
+                                                    recall_list=recall_list)
     ranked_list = ranked_result['result']
+    if recall_list and not ranked_list:
+        bot(
+            title="NLP服务请求失败",
+            detail={
+                "trace_id": info['trace_id']
+            },
+            mention=False
+        )
+
     for recall_obj in ranked_list:
         if recall_obj:
             platform = recall_obj['platform']
             recall_video = recall_obj['result']
             score = recall_obj['score']
-            # 过滤掉jcd分数为0的
-            if rank_alg == 'jcd' and score == server_const.JCD_SIMILARITY_THRESHOLD:
-                continue
 
-            # 过滤掉nlp分低于0.3
-            if rank_alg == 'nlp' and score < server_const.NLP_SIMILARITY_THRESHOLD:
+            # 过滤掉nlp分低于0.45的
+            if score < server_const.NLP_SIMILARITY_THRESHOLD:
                 continue
 
             if recall_video:
@@ -130,4 +138,4 @@ async def search_videos_from_web(info, gh_id_map, db_client):
                     db_client=db_client,
                     similarity_score=score
                 )
-    return len(ranked_list)
+    return len(ranked_list)