rank.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. """
  2. @author: luojunhui
  3. """
  4. from typing import Dict
  5. from applications.match_algorithm.title_similarity import jcd_title_similarity
  6. from applications.match_algorithm.title_similarity import nlp_title_similarity
  7. def jac_score(d1, d2):
  8. """
  9. 通过交并集来判断
  10. :param d1:
  11. :param d2:
  12. :return:
  13. """
  14. f1_keys = set(d1["key_words"])
  15. f2_keys = set(d2["key_words"])
  16. keys_union = f1_keys | f2_keys
  17. keys_intersection = f1_keys & f2_keys
  18. f1_search_keys = set(d1["search_keys"])
  19. f2_search_keys = set(d2["search_keys"])
  20. search_keys_union = f1_search_keys | f2_search_keys
  21. search_keys_intersection = f1_search_keys & f2_search_keys
  22. f1_extra_keys = set(d1["extra_keys"])
  23. f2_extra_keys = set(d2["extra_keys"])
  24. extra_keys_union = f1_extra_keys | f2_extra_keys
  25. extra_keys_intersection = f1_extra_keys & f2_extra_keys
  26. score_1 = len(keys_intersection) / len(keys_union)
  27. score_2 = len(search_keys_intersection) / len(search_keys_union)
  28. score_3 = len(extra_keys_intersection) / len(extra_keys_union)
  29. return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
  30. def title_similarity_rank(content_title, recall_list):
  31. """
  32. :param content_title:
  33. :param recall_list:
  34. :return:
  35. """
  36. include_title_list = []
  37. for item in recall_list:
  38. video_info = item['result']
  39. platform = item['platform']
  40. if platform in ['dy_search', 'baidu_search']:
  41. title = video_info['title']
  42. elif platform in ['xg_search']:
  43. title = video_info['video_title']
  44. else:
  45. continue
  46. item['title'] = title
  47. item['score'] = jcd_title_similarity(content_title, title)
  48. include_title_list.append(item)
  49. sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
  50. return sorted_list
  51. async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
  52. """
  53. 通过相关性模型来计算文章标题和搜索标题之间的相关性
  54. """
  55. title_list = [i['title'] for i in recall_list]
  56. score_list = await nlp_title_similarity(
  57. ori_title=content_title,
  58. search_title_list=title_list
  59. )
  60. if score_list:
  61. sorted_list = sorted(
  62. (
  63. {**item, 'score': score}
  64. for item, score in zip(recall_list, score_list)
  65. ),
  66. key=lambda x: x['score'],
  67. reverse=True
  68. )
  69. response = {
  70. "alg": "nlp",
  71. "result": sorted_list
  72. }
  73. else:
  74. # if nlp server is down, use jcd similarity instead
  75. response = {
  76. "alg": "jcd",
  77. "result": title_similarity_rank(content_title, recall_list)
  78. }
  79. return response