rank.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. """
  2. @author: luojunhui
  3. """
  4. from typing import Dict
  5. from applications.config import nlp_base_url, nlp_aliyun_url
  6. from applications.match_algorithm.title_similarity import jcd_title_similarity
  7. from applications.match_algorithm.title_similarity import nlp_title_similarity
# Module-level empty list. It is a single shared, mutable object: any code
# that hands it out and then lets a caller mutate it changes it everywhere.
empty_list = []
  9. def jac_score(d1, d2):
  10. """
  11. 通过交并集来判断
  12. :param d1:
  13. :param d2:
  14. :return:
  15. """
  16. f1_keys = set(d1["key_words"])
  17. f2_keys = set(d2["key_words"])
  18. keys_union = f1_keys | f2_keys
  19. keys_intersection = f1_keys & f2_keys
  20. f1_search_keys = set(d1["search_keys"])
  21. f2_search_keys = set(d2["search_keys"])
  22. search_keys_union = f1_search_keys | f2_search_keys
  23. search_keys_intersection = f1_search_keys & f2_search_keys
  24. f1_extra_keys = set(d1["extra_keys"])
  25. f2_extra_keys = set(d2["extra_keys"])
  26. extra_keys_union = f1_extra_keys | f2_extra_keys
  27. extra_keys_intersection = f1_extra_keys & f2_extra_keys
  28. score_1 = len(keys_intersection) / len(keys_union)
  29. score_2 = len(search_keys_intersection) / len(search_keys_union)
  30. score_3 = len(extra_keys_intersection) / len(extra_keys_union)
  31. return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
  32. def title_similarity_rank(content_title, recall_list):
  33. """
  34. :param content_title:
  35. :param recall_list:
  36. :return:
  37. """
  38. include_title_list = []
  39. for item in recall_list:
  40. video_info = item['result']
  41. platform = item['platform']
  42. if platform in ['dy_search', 'baidu_search']:
  43. title = video_info['title']
  44. elif platform in ['xg_search']:
  45. title = video_info['video_title']
  46. else:
  47. continue
  48. item['title'] = title
  49. item['score'] = jcd_title_similarity(content_title, title)
  50. include_title_list.append(item)
  51. sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
  52. return sorted_list
  53. async def title_similarity_with_nlp(content_title, recall_list) -> Dict:
  54. """
  55. 通过相关性模型来计算文章标题和搜索标题之间的相关性
  56. """
  57. title_list = [i['result']['title'] for i in recall_list]
  58. score_list = await nlp_title_similarity(
  59. url=nlp_base_url,
  60. ori_title=content_title,
  61. search_title_list=title_list
  62. )
  63. # if local machine is down, use aliyun machine
  64. if not score_list:
  65. score_list = await nlp_title_similarity(
  66. url=nlp_aliyun_url,
  67. ori_title=content_title,
  68. search_title_list=title_list
  69. )
  70. # check whether score_list exist
  71. if score_list:
  72. sorted_list = sorted(
  73. (
  74. {**item, 'score': score}
  75. for item, score in zip(recall_list, score_list)
  76. ),
  77. key=lambda x: x['score'],
  78. reverse=True
  79. )
  80. response = {
  81. "alg": "nlp",
  82. "result": sorted_list
  83. }
  84. else:
  85. response = {
  86. "result": empty_list
  87. }
  88. return response