Merge branch '2025-01-05-use-nlp-similarity' of Server/title_with_video into master

luojunhui 3 months ago
commit 53bdb70be0

+ 7 - 1
applications/config/__init__.py

@@ -37,4 +37,10 @@ moon_shot = {
     "api_key": "sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q",
     "model": "moonshot-v1-32k",
     "base_url": "https://api.moonshot.cn/v1"
-}
+}
+
+# NLP server URL (local machine)
+nlp_base_url = 'http://61.48.133.26:6060/nlp'
+
+# NLP server URL (aliyun machine, used as fallback)
+nlp_aliyun_url = 'http://47.98.136.48:6060/nlp'

+ 7 - 0
applications/const/server_const.py

@@ -62,5 +62,12 @@ class ServerConst:
     TASK_FAIL_CODE = 99
     TASK_PROCESSING_CODE = 101
 
+    # relevance filter threshold for NLP similarity
+    NLP_SIMILARITY_THRESHOLD = 0.45
+
+    JCD_SIMILARITY_THRESHOLD = 0
+
+    # maximum video duration (seconds)
+    MAX_VIDEO_DURATION = 300
 
 

+ 2 - 1
applications/match_algorithm/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 Matching algorithms
-"""
+"""
+from .rank import title_similarity_with_nlp

+ 46 - 0
applications/match_algorithm/rank.py

@@ -1,7 +1,13 @@
 """
 @author: luojunhui
 """
+from typing import Dict
+
+from applications.config import nlp_base_url, nlp_aliyun_url
 from applications.match_algorithm.title_similarity import jcd_title_similarity
+from applications.match_algorithm.title_similarity import nlp_title_similarity
+
+empty_list = []
 
 
 def jac_score(d1, d2):
@@ -51,3 +57,43 @@ def title_similarity_rank(content_title, recall_list):
         include_title_list.append(item)
     sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
     return sorted_list
+
+
+async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
+    """
+    Score the relevance between the article title and each recalled search title with the NLP similarity model
+    """
+    title_list = [i['title'] for i in recall_list]
+    score_list = await nlp_title_similarity(
+        url=nlp_base_url,
+        ori_title=content_title,
+        search_title_list=title_list
+    )
+    # fall back to the aliyun machine if the local machine is down
+    if not score_list:
+        score_list = await nlp_title_similarity(
+            url=nlp_aliyun_url,
+            ori_title=content_title,
+            search_title_list=title_list
+        )
+
+    # proceed only if the NLP service returned scores
+    if score_list:
+        sorted_list = sorted(
+            (
+                {**item, 'score': score}
+                for item, score in zip(recall_list, score_list)
+            ),
+            key=lambda x: x['score'],
+            reverse=True
+        )
+        response = {
+            "alg": "nlp",
+            "result": sorted_list
+        }
+    else:
+        response = {
+            "result": empty_list
+        }
+
+    return response
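For reference, a minimal sketch of how the new ranker would be driven from a script. The asyncio wrapper and the sample recall items below are illustrative, not part of the commit, and a reachable NLP service is assumed:

import asyncio

from applications.match_algorithm import title_similarity_with_nlp

# Illustrative recall items; in production these come from the search spiders.
recall_list = [
    {"platform": "dy_search", "title": "sample title A", "result": {"video_id": "a1"}},
    {"platform": "baidu_search", "title": "sample title B", "result": {"video_id": "b2"}},
]

response = asyncio.run(title_similarity_with_nlp("original article title", recall_list))
# Success: {"alg": "nlp", "result": [items with an added 'score', sorted descending]}
# Both NLP hosts unreachable: {"result": []}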

+ 27 - 0
applications/match_algorithm/title_similarity.py

@@ -1,6 +1,9 @@
 """
 @author: luojunhui
 """
+import aiohttp
+
+empty_list = []
 
 
 def jcd_title_similarity(ori_title, search_title):
@@ -15,3 +18,27 @@ def jcd_title_similarity(ori_title, search_title):
     intersection = len(set1 & set2)
     union = len(set1 | set2)
     return intersection / union
+
+
+async def nlp_title_similarity(url, ori_title, search_title_list):
+    """
+    Score ori_title against each title in search_title_list via the remote NLP service
+    """
+    headers = {"Content-Type": "application/json"}
+    body = {
+        "data": {
+            "text_list_a": [ori_title],
+            "text_list_b": search_title_list,
+        },
+        "function": "similarities_cross",
+        "use_cache": False
+    }
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, headers=headers, json=body) as response:
+            response_text = await response.text()
+            if response_text and response.status == 200:
+                res = await response.json()
+                score_list = res['score_list_list'][0]
+                return score_list
+            else:
+                return empty_list
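As a sanity check on the character-level Jaccard in jcd_title_similarity, a worked example; the numbers follow directly from the function above:

from applications.match_algorithm.title_similarity import jcd_title_similarity

# set("abcd") & set("bcde") == {"b", "c", "d"}           -> 3 characters
# set("abcd") | set("bcde") == {"a", "b", "c", "d", "e"} -> 5 characters
assert jcd_title_similarity("abcd", "bcde") == 3 / 5

For nlp_title_similarity, the service is assumed to answer with a JSON body shaped like {"score_list_list": [[0.91, 0.34, ...]]}; that shape is inferred from the parsing above, not from any service documentation.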

+ 2 - 1
applications/search/dy_search.py

@@ -7,6 +7,7 @@ import requests
 
 from applications.functions.common import sensitive_flag
 from applications.log import logging
+from applications.const import server_const
 
 
 def douyin_search(keyword, sensitive_words, trace_id):
@@ -44,7 +45,7 @@ def douyin_search(keyword, sensitive_words, trace_id):
                 title = obj['video_desc']
                 video_id = obj['video_id']
                 duration = int(obj['duration'])
-                if sensitive_flag(sensitive_words, title) and duration < 30000:
+                if sensitive_flag(sensitive_words, title) and duration < server_const.MAX_VIDEO_DURATION * 1000:
                     res = douyin_detail(video_id)
                     if res:
                         L.append(res)
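Unit caveat: douyin durations appear to be in milliseconds (the original literal 30000 capped clips at 30 s), so this change both routes the limit through the shared constant and raises the douyin cap from 30 s to 300 s, matching hksp_search below, which works in seconds. A hypothetical helper making the two comparisons explicit (both call sites inline this logic):

from applications.const import server_const

def fits_duration_cap(duration: int, unit: str = "s") -> bool:
    """True if a clip fits under the shared 300-second cap."""
    if unit == "ms":  # douyin-style millisecond durations
        return duration < server_const.MAX_VIDEO_DURATION * 1000
    return duration <= server_const.MAX_VIDEO_DURATION  # hksp-style seconds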

+ 2 - 2
applications/search/hksp_search.py

@@ -13,6 +13,7 @@ from fake_useragent import FakeUserAgent
 
 from applications.functions.common import sensitive_flag
 from applications.log import logging
+from applications.const import server_const
 
 
 def tunnel_proxies():
@@ -117,10 +118,9 @@ def hksp_search(key, sensitive_words, trace_id):
                 video_id = data['vid']
                 title = data['title']
                 duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
-                if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
+                if sensitive_flag(sensitive_words, title) and int(duration) <= server_const.MAX_VIDEO_DURATION:
                     res = get_video_detail(video_id)
                     L.append(res)
-                    return L
                 else:
                     continue
             except Exception as e:

+ 24 - 5
applications/spider/__init__.py

@@ -3,14 +3,17 @@
 """
 from datetime import datetime
 
+from applications.feishu import bot
+from applications.const import server_const
 from applications.functions.video_item import VideoProducer
 from applications.log import logging
-from applications.match_algorithm.rank import title_similarity_rank
+from applications.match_algorithm import title_similarity_with_nlp
 from .spiderAB import SearchABTest
 from .spiderSchedule import SearchMethod
 
 
-async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client, similarity_score):
+async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
+                              similarity_score):
     """
     Asynchronously process a WeChat video_obj.
     Each official account maps one-to-one to an in-house account.
@@ -94,7 +97,7 @@ async def search_videos_from_web(info, gh_id_map, db_client):
     trace_id = info['trace_id']
     gh_id = info['gh_id']
     content_id = info['content_id']
-    recall_list = await search_AB.ab_5()
+    recall_list = await search_AB.ab_6()
     logging(
         code="1006",
         info="搜索到{}条视频".format(len(recall_list)),
@@ -102,12 +105,28 @@ async def search_videos_from_web(info, gh_id_map, db_client):
         trace_id=info['trace_id']
     )
     # rank results by title similarity
-    ranked_list = title_similarity_rank(content_title=info['ori_title'].split("@@")[-1], recall_list=recall_list)
+    ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1],
+                                                    recall_list=recall_list)
+    ranked_list = ranked_result['result']
+    if recall_list and not ranked_list:
+        bot(
+            title="NLP服务请求失败",
+            detail={
+                "trace_id": info['trace_id']
+            },
+            mention=False
+        )
+
     for recall_obj in ranked_list:
         if recall_obj:
             platform = recall_obj['platform']
             recall_video = recall_obj['result']
             score = recall_obj['score']
+
+            # drop candidates whose NLP score is below the threshold (0.45)
+            if score < server_const.NLP_SIMILARITY_THRESHOLD:
+                continue
+
             if recall_video:
                 await save_video_to_mysql(
                     video_obj=recall_video,
@@ -119,4 +138,4 @@ async def search_videos_from_web(info, gh_id_map, db_client):
                     db_client=db_client,
                     similarity_score=score
                 )
-    return len(ranked_list)
+    return len(ranked_list)

+ 22 - 1
applications/spider/spiderAB.py

@@ -122,7 +122,7 @@ class SearchABTest(object):
             text=self.article_summary[:15],
             trace_id=self.trace_id
         )
-        if len(result_list) > 3:
+        if len(result_list) > 5:
             return result_list
         else:
             result_list += await self.search_method.search_v2(
@@ -151,3 +151,24 @@ class SearchABTest(object):
                             trace_id=self.trace_id
                         )
                         return result_list
+
+    async def ab_6(self):
+        """
+        Search sequentially: article summary, then original title, then article keywords
+        """
+        search_result_by_summary = await self.search_method.search_v2(
+            text=self.article_summary[:15],
+            trace_id=self.trace_id
+        )
+
+        search_result_by_ori_title = await self.search_method.search_v2(
+            text=self.ori_title[:15],
+            trace_id=self.trace_id
+        )
+
+        search_result_by_article_keys = await self.search_method.search_v2(
+            text=",".join(self.article_keys),
+            trace_id=self.trace_id
+        )
+
+        return search_result_by_summary + search_result_by_ori_title + search_result_by_article_keys
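ab_6 awaits its three searches strictly in sequence. A hedged alternative, assuming search_v2 is non-blocking end to end, would overlap them with asyncio.gather; if search_v2 merely wraps blocking requests calls, gather gains nothing:

import asyncio

async def ab_6_concurrent(self):
    """Sketch: run the three searches concurrently and concatenate the results."""
    by_summary, by_title, by_keys = await asyncio.gather(
        self.search_method.search_v2(text=self.article_summary[:15], trace_id=self.trace_id),
        self.search_method.search_v2(text=self.ori_title[:15], trace_id=self.trace_id),
        self.search_method.search_v2(text=",".join(self.article_keys), trace_id=self.trace_id),
    )
    return by_summary + by_title + by_keys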

+ 8 - 10
applications/spider/spiderSchedule.py

@@ -46,13 +46,11 @@ class SearchMethod(object):
         douyin_result = douyin_search(keyword=text, sensitive_words=cls.s_words, trace_id=trace_id)
         for vid_obj in douyin_result:
             L.append({"platform": "dy_search", "result": vid_obj})
-        if len(L) >= 3:
-            return L
-        else:
-            baidu_result = hksp_search(key=text, sensitive_words=cls.s_words, trace_id=trace_id)
-            if baidu_result:
-                L.append({"platform": "baidu_search", "result": baidu_result[0]})
-            xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
-            if xigua_result:
-                L.append({"platform": "xg_search", "result": xigua_result[0]})
-            return L
+        baidu_result = hksp_search(key=text, sensitive_words=cls.s_words, trace_id=trace_id)
+        if baidu_result:
+            for baidu_obj in baidu_result:
+                L.append({"platform": "baidu_search", "result": baidu_obj})
+        # xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
+        # if xigua_result:
+        #     L.append({"platform": "xg_search", "result": xigua_result[0]})
+        return L