Merge branch '2025-01-05-use-nlp-similarity' of Server/title_with_video into master

luojunhui 3 months ago
commit 53bdb70be0

+ 7 - 1
applications/config/__init__.py

@@ -37,4 +37,10 @@ moon_shot = {
     "api_key": "sk-5DqYCa88kche6nwIWjLE1p4oMm8nXrR9kQMKbBolNAWERu7q",
     "model": "moonshot-v1-32k",
     "base_url": "https://api.moonshot.cn/v1"
-}
+}
+
+# NLP server URL (local machine)
+nlp_base_url = 'http://61.48.133.26:6060/nlp'
+
+# NLP server URL (aliyun machine, used as fallback)
+nlp_aliyun_url = 'http://47.98.136.48:6060/nlp'

+ 7 - 0
applications/const/server_const.py

@@ -62,5 +62,12 @@ class ServerConst:
     TASK_FAIL_CODE = 99
     TASK_PROCESSING_CODE = 101
 
+    # relevance filter threshold for NLP similarity
+    NLP_SIMILARITY_THRESHOLD = 0.45
+
+    JCD_SIMILARITY_THRESHOLD = 0
+
+    # maximum video duration (seconds)
+    MAX_VIDEO_DURATION = 300
 
 

+ 2 - 1
applications/match_algorithm/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 Matching algorithms
-"""
+"""
+from .rank import title_similarity_with_nlp

+ 46 - 0
applications/match_algorithm/rank.py

@@ -1,7 +1,13 @@
 """
 @author: luojunhui
 """
+from typing import Dict
+
+from applications.config import nlp_base_url, nlp_aliyun_url
 from applications.match_algorithm.title_similarity import jcd_title_similarity
+from applications.match_algorithm.title_similarity import nlp_title_similarity
+
+empty_list = []
 
 
 def jac_score(d1, d2):
@@ -51,3 +57,43 @@ def title_similarity_rank(content_title, recall_list):
         include_title_list.append(item)
     sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
     return sorted_list
+
+
+async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
+    """
+    Score the relevance between the article title and each recalled search title with the NLP similarity model
+    """
+    title_list = [i['title'] for i in recall_list]
+    score_list = await nlp_title_similarity(
+        url=nlp_base_url,
+        ori_title=content_title,
+        search_title_list=title_list
+    )
+    # fall back to the aliyun machine if the local machine is down
+    if not score_list:
+        score_list = await nlp_title_similarity(
+            url=nlp_aliyun_url,
+            ori_title=content_title,
+            search_title_list=title_list
+        )
+
+    # proceed only if the NLP service returned scores
+    if score_list:
+        sorted_list = sorted(
+            (
+                {**item, 'score': score}
+                for item, score in zip(recall_list, score_list)
+            ),
+            key=lambda x: x['score'],
+            reverse=True
+        )
+        response = {
+            "alg": "nlp",
+            "result": sorted_list
+        }
+    else:
+        response = {
+            "result": empty_list
+        }
+
+    return response
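For reference, a minimal sketch of how the new ranker would be driven from a script. The asyncio wrapper and the sample recall items below are illustrative, not part of the commit, and a reachable NLP service is assumed:

import asyncio

from applications.match_algorithm import title_similarity_with_nlp

# Illustrative recall items; in production these come from the search spiders.
recall_list = [
    {"platform": "dy_search", "title": "sample title A", "result": {"video_id": "a1"}},
    {"platform": "baidu_search", "title": "sample title B", "result": {"video_id": "b2"}},
]

response = asyncio.run(title_similarity_with_nlp("original article title", recall_list))
# Success: {"alg": "nlp", "result": [items with an added 'score', sorted descending]}
# Both NLP hosts unreachable: {"result": []}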

+ 27 - 0
applications/match_algorithm/title_similarity.py

@@ -1,6 +1,9 @@
 """
 @author: luojunhui
 """
+import aiohttp
+
+empty_list = []
 
 
 def jcd_title_similarity(ori_title, search_title):
@@ -15,3 +18,27 @@ def jcd_title_similarity(ori_title, search_title):
     intersection = len(set1 & set2)
     union = len(set1 | set2)
     return intersection / union
+
+
+async def nlp_title_similarity(url, ori_title, search_title_list):
+    """
+    Score ori_title against each title in search_title_list via the remote NLP service
+    """
+    headers = {"Content-Type": "application/json"}
+    body = {
+        "data": {
+            "text_list_a": [ori_title],
+            "text_list_b": search_title_list,
+        },
+        "function": "similarities_cross",
+        "use_cache": False
+    }
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, headers=headers, json=body) as response:
+            response_text = await response.text()
+            if response_text and response.status == 200:
+                res = await response.json()
+                score_list = res['score_list_list'][0]
+                return score_list
+            else:
+                return empty_list
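As a sanity check on the character-level Jaccard in jcd_title_similarity, a worked example; the numbers follow directly from the function above:

from applications.match_algorithm.title_similarity import jcd_title_similarity

# set("abcd") & set("bcde") == {"b", "c", "d"}           -> 3 characters
# set("abcd") | set("bcde") == {"a", "b", "c", "d", "e"} -> 5 characters
assert jcd_title_similarity("abcd", "bcde") == 3 / 5

For nlp_title_similarity, the service is assumed to answer with a JSON body shaped like {"score_list_list": [[0.91, 0.34, ...]]}; that shape is inferred from the parsing above, not from any service documentation.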

+ 2 - 1
applications/search/dy_search.py

@@ -7,6 +7,7 @@ import requests
 
 from applications.functions.common import sensitive_flag
 from applications.log import logging
+from applications.const import server_const
 
 
 def douyin_search(keyword, sensitive_words, trace_id):
@@ -44,7 +45,7 @@ def douyin_search(keyword, sensitive_words, trace_id):
                 title = obj['video_desc']
                 video_id = obj['video_id']
                 duration = int(obj['duration'])
-                if sensitive_flag(sensitive_words, title) and duration < 30000:
+                if sensitive_flag(sensitive_words, title) and duration < server_const.MAX_VIDEO_DURATION * 1000:
                     res = douyin_detail(video_id)
                     if res:
                         L.append(res)
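Unit caveat: douyin durations appear to be in milliseconds (the original literal 30000 capped clips at 30 s), so this change both routes the limit through the shared constant and raises the douyin cap from 30 s to 300 s, matching hksp_search below, which works in seconds. A hypothetical helper making the two comparisons explicit (both call sites inline this logic):

from applications.const import server_const

def fits_duration_cap(duration: int, unit: str = "s") -> bool:
    """True if a clip fits under the shared 300-second cap."""
    if unit == "ms":  # douyin-style millisecond durations
        return duration < server_const.MAX_VIDEO_DURATION * 1000
    return duration <= server_const.MAX_VIDEO_DURATION  # hksp-style seconds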

+ 2 - 2
applications/search/hksp_search.py

@@ -13,6 +13,7 @@ from fake_useragent import FakeUserAgent
 
 from applications.functions.common import sensitive_flag
 from applications.log import logging
+from applications.const import server_const
 
 
 def tunnel_proxies():
@@ -117,10 +118,9 @@ def hksp_search(key, sensitive_words, trace_id):
                 video_id = data['vid']
                 title = data['title']
                 duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
-                if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
+                if sensitive_flag(sensitive_words, title) and int(duration) <= server_const.MAX_VIDEO_DURATION:
                     res = get_video_detail(video_id)
                     L.append(res)
-                    return L
                 else:
                     continue
             except Exception as e:

+ 24 - 5
applications/spider/__init__.py

@@ -3,14 +3,17 @@
 """
 from datetime import datetime
 
+from applications.feishu import bot
+from applications.const import server_const
 from applications.functions.video_item import VideoProducer
 from applications.log import logging
-from applications.match_algorithm.rank import title_similarity_rank
+from applications.match_algorithm import title_similarity_with_nlp
 from .spiderAB import SearchABTest
 from .spiderSchedule import SearchMethod
 
 
-async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client, similarity_score):
+async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
+                              similarity_score):
     """
     Asynchronously process a WeChat video_obj.
     Each official account maps one-to-one to an in-house account.
@@ -94,7 +97,7 @@ async def search_videos_from_web(info, gh_id_map, db_client):
     trace_id = info['trace_id']
     gh_id = info['gh_id']
     content_id = info['content_id']
-    recall_list = await search_AB.ab_5()
+    recall_list = await search_AB.ab_6()
     logging(
         code="1006",
         info="搜索到{}条视频".format(len(recall_list)),
@@ -102,12 +105,28 @@ async def search_videos_from_web(info, gh_id_map, db_client):
         trace_id=info['trace_id']
     )
     # rank results by title similarity
-    ranked_list = title_similarity_rank(content_title=info['ori_title'].split("@@")[-1], recall_list=recall_list)
+    ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1],
+                                                    recall_list=recall_list)
+    ranked_list = ranked_result['result']
+    if recall_list and not ranked_list:
+        bot(
+            title="NLP服务请求失败",
+            detail={
+                "trace_id": info['trace_id']
+            },
+            mention=False
+        )
+
     for recall_obj in ranked_list:
         if recall_obj:
             platform = recall_obj['platform']
             recall_video = recall_obj['result']
             score = recall_obj['score']
+
+            # drop candidates whose NLP score is below the threshold (0.45)
+            if score < server_const.NLP_SIMILARITY_THRESHOLD:
+                continue
+
             if recall_video:
                 await save_video_to_mysql(
                     video_obj=recall_video,
@@ -119,4 +138,4 @@ async def search_videos_from_web(info, gh_id_map, db_client):
                     db_client=db_client,
                     similarity_score=score
                 )
-    return len(ranked_list)
+    return len(ranked_list)

+ 22 - 1
applications/spider/spiderAB.py

@@ -122,7 +122,7 @@ class SearchABTest(object):
             text=self.article_summary[:15],
             trace_id=self.trace_id
         )
-        if len(result_list) > 3:
+        if len(result_list) > 5:
             return result_list
         else:
             result_list += await self.search_method.search_v2(
@@ -151,3 +151,24 @@ class SearchABTest(object):
                             trace_id=self.trace_id
                         )
                         return result_list
+
+    async def ab_6(self):
+        """
+        Search sequentially: article summary, then original title, then article keywords
+        """
+        search_result_by_summary = await self.search_method.search_v2(
+            text=self.article_summary[:15],
+            trace_id=self.trace_id
+        )
+
+        search_result_by_ori_title = await self.search_method.search_v2(
+            text=self.ori_title[:15],
+            trace_id=self.trace_id
+        )
+
+        search_result_by_article_keys = await self.search_method.search_v2(
+            text=",".join(self.article_keys),
+            trace_id=self.trace_id
+        )
+
+        return search_result_by_summary + search_result_by_ori_title + search_result_by_article_keys
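ab_6 awaits its three searches strictly in sequence. A hedged alternative, assuming search_v2 is non-blocking end to end, would overlap them with asyncio.gather; if search_v2 merely wraps blocking requests calls, gather gains nothing:

import asyncio

async def ab_6_concurrent(self):
    """Sketch: run the three searches concurrently and concatenate the results."""
    by_summary, by_title, by_keys = await asyncio.gather(
        self.search_method.search_v2(text=self.article_summary[:15], trace_id=self.trace_id),
        self.search_method.search_v2(text=self.ori_title[:15], trace_id=self.trace_id),
        self.search_method.search_v2(text=",".join(self.article_keys), trace_id=self.trace_id),
    )
    return by_summary + by_title + by_keys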

+ 8 - 10
applications/spider/spiderSchedule.py

@@ -46,13 +46,11 @@ class SearchMethod(object):
         douyin_result = douyin_search(keyword=text, sensitive_words=cls.s_words, trace_id=trace_id)
         for vid_obj in douyin_result:
             L.append({"platform": "dy_search", "result": vid_obj})
-        if len(L) >= 3:
-            return L
-        else:
-            baidu_result = hksp_search(key=text, sensitive_words=cls.s_words, trace_id=trace_id)
-            if baidu_result:
-                L.append({"platform": "baidu_search", "result": baidu_result[0]})
-            xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
-            if xigua_result:
-                L.append({"platform": "xg_search", "result": xigua_result[0]})
-            return L
+        baidu_result = hksp_search(key=text, sensitive_words=cls.s_words, trace_id=trace_id)
+        if baidu_result:
+            for baidu_obj in baidu_result:
+                L.append({"platform": "baidu_search", "result": baidu_obj})
+        # xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
+        # if xigua_result:
+        #     L.append({"platform": "xg_search", "result": xigua_result[0]})
+        return L