rank.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. """
  2. @author: luojunhui
  3. """
  4. from applications.match_algorithm.title_similarity import jcd_title_similarity
  5. from applications.functions.log import logging
  6. def jac_score(d1, d2):
  7. """
  8. 通过交并集来判断
  9. :param d1:
  10. :param d2:
  11. :return:
  12. """
  13. f1_keys = set(d1["key_words"])
  14. f2_keys = set(d2["key_words"])
  15. keys_union = f1_keys | f2_keys
  16. keys_intersection = f1_keys & f2_keys
  17. f1_search_keys = set(d1["search_keys"])
  18. f2_search_keys = set(d2["search_keys"])
  19. search_keys_union = f1_search_keys | f2_search_keys
  20. search_keys_intersection = f1_search_keys & f2_search_keys
  21. f1_extra_keys = set(d1["extra_keys"])
  22. f2_extra_keys = set(d2["extra_keys"])
  23. extra_keys_union = f1_extra_keys | f2_extra_keys
  24. extra_keys_intersection = f1_extra_keys & f2_extra_keys
  25. score_1 = len(keys_intersection) / len(keys_union)
  26. score_2 = len(search_keys_intersection) / len(search_keys_union)
  27. score_3 = len(extra_keys_intersection) / len(extra_keys_union)
  28. return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
  29. def title_similarity_rank(content_title, recall_list):
  30. """
  31. :param content_title:
  32. :param recall_list:
  33. :return:
  34. """
  35. print("ori_title", content_title)
  36. print("unsorted title list")
  37. include_title_list = []
  38. for item in recall_list:
  39. video_info = item['result']
  40. platform = item['platform']
  41. if platform in ['dy_search', 'baidu_search']:
  42. title = video_info['title']
  43. elif platform in ['xg_search']:
  44. title = video_info['video_title']
  45. else:
  46. continue
  47. item['title'] = title
  48. print(title)
  49. item['score'] = jcd_title_similarity(content_title, title)
  50. include_title_list.append(item)
  51. # # include_title_list加上相似度分
  52. # title_score_list = [
  53. # {
  54. # 'score': jcd_title_similarity(
  55. # content_title,
  56. # item['title']
  57. # ),
  58. # **item
  59. # }
  60. # for item in
  61. # include_title_list
  62. # ]
  63. sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
  64. return sorted_list