瀏覽代碼

2024-07-01
标题相似度-v1

罗俊辉 9 月之前
父節點
當前提交
daecbee549

+ 29 - 1
applications/match_algorithm/rank.py

@@ -1,7 +1,7 @@
 """
 @author: luojunhui
 """
-from applications.match_algorithm.recall import recall_videos
+from applications.match_algorithm.title_similarity import jcd_title_similarity
 from applications.functions.log import logging
 
 
@@ -29,3 +29,31 @@ def jac_score(d1, d2):
     score_3 = len(extra_keys_intersection) / len(extra_keys_union)
     return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
 
+
def title_similarity_rank(content_title, recall_list):
    """
    Rank recalled videos by the similarity of their title to ``content_title``.

    Only entries whose platform has a known title field are ranked:
    ``dy_search`` and ``baidu_search`` expose it as ``result['title']``,
    ``xg_search`` as ``result['video_title']``. Entries from any other
    platform, and empty/``None`` entries, are left out of the result.

    :param content_title: title every recalled video title is compared against
    :param recall_list: iterable of recall dicts with ``platform`` and ``result`` keys
    :return: new list of dicts (a copy of each kept recall dict plus ``title``
        and ``score``), sorted by ``score`` from high to low; entries with equal
        scores keep their input order and the input dicts are not modified
    """
    title_key_by_platform = {
        'dy_search': 'title',
        'baidu_search': 'title',
        'xg_search': 'video_title',
    }
    scored_list = []
    for item in recall_list:
        # The consumer of this ranking guards against empty recall entries,
        # so they can occur in the input; skip them instead of crashing.
        if not item:
            continue
        title_key = title_key_by_platform.get(item['platform'])
        if title_key is None:
            continue
        title = item['result'][title_key]
        score = jcd_title_similarity(content_title, title)
        # Work on a copy so the caller's dicts stay untouched. 'score' is
        # placed first to keep the key order, then re-assigned so a 'score'
        # key already present in the item cannot shadow the similarity.
        ranked = {'score': score, **item}
        ranked['title'] = title
        ranked['score'] = score
        scored_list.append(ranked)
    # list.sort is stable, also with reverse=True, so ties keep input order.
    scored_list.sort(key=lambda x: x['score'], reverse=True)
    return scored_list

+ 17 - 0
applications/match_algorithm/title_similarity.py

@@ -0,0 +1,17 @@
+"""
+@author: luojunhui
+"""
+
+
def jcd_title_similarity(ori_title, search_title):
    """
    Character-level Jaccard similarity of two titles.

    Each title is reduced to the set of its distinct characters and the
    score is ``len(A & B) / len(A | B)``.

    :param ori_title: original title; ``None`` is treated as an empty title
    :param search_title: title to compare with; ``None`` is treated as empty
    :return: float between 0.0 and 1.0; 0.0 when both titles are empty
    """
    chars_a = set(ori_title or "")
    chars_b = set(search_title or "")
    union = chars_a | chars_b
    # Two empty titles have an empty union; report "no similarity" rather
    # than dividing by zero.
    if not union:
        return 0.0
    return len(chars_a & chars_b) / len(union)

+ 4 - 2
applications/schedule/search_schedule.py

@@ -5,11 +5,11 @@
 import json
 import time
 
+from applications.match_algorithm.rank import title_similarity_rank
 from applications.search import *
 from applications.static.config import gh_id_dict, db_article
 from applications.functions.log import logging
 from applications.functions.video_item import VideoProducer
-from applications.functions.mysql import select_sensitive_words
 from applications.functions.kimi import KimiServer
 from applications.functions.common import request_etl
 
@@ -324,8 +324,10 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
         data=recall_list,
         trace_id=trace_id
     )
+    # 按照标题相似度排序
+    ranked_list = title_similarity_rank(content_title=content_title, recall_list=recall_list)
     index = 0
-    for recall_obj in recall_list:
+    for recall_obj in ranked_list:
         if recall_obj:
             platform = recall_obj['platform']
             recall_video = recall_obj['result']