rank.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. """
  2. @author: luojunhui
  3. """
  4. from datetime import datetime, timedelta
  5. from applications.match_algorithm.title_similarity import jcd_title_similarity
  6. def jac_score(d1, d2):
  7. """
  8. 通过交并集来判断
  9. :param d1:
  10. :param d2:
  11. :return:
  12. """
  13. f1_keys = set(d1["key_words"])
  14. f2_keys = set(d2["key_words"])
  15. keys_union = f1_keys | f2_keys
  16. keys_intersection = f1_keys & f2_keys
  17. f1_search_keys = set(d1["search_keys"])
  18. f2_search_keys = set(d2["search_keys"])
  19. search_keys_union = f1_search_keys | f2_search_keys
  20. search_keys_intersection = f1_search_keys & f2_search_keys
  21. f1_extra_keys = set(d1["extra_keys"])
  22. f2_extra_keys = set(d2["extra_keys"])
  23. extra_keys_union = f1_extra_keys | f2_extra_keys
  24. extra_keys_intersection = f1_extra_keys & f2_extra_keys
  25. score_1 = len(keys_intersection) / len(keys_union)
  26. score_2 = len(search_keys_intersection) / len(search_keys_union)
  27. score_3 = len(extra_keys_intersection) / len(extra_keys_union)
  28. return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
  29. def title_similarity_rank(content_title, recall_list):
  30. """
  31. :param content_title:
  32. :param recall_list:
  33. :return:
  34. """
  35. include_title_list = []
  36. for item in recall_list:
  37. video_info = item['result']
  38. platform = item['platform']
  39. if platform in ['dy_search', 'baidu_search']:
  40. title = video_info['title']
  41. elif platform in ['xg_search']:
  42. title = video_info['video_title']
  43. else:
  44. continue
  45. item['title'] = title
  46. item['score'] = jcd_title_similarity(content_title, title)
  47. include_title_list.append(item)
  48. sorted_list = sorted(include_title_list, key=lambda x: x['score'], reverse=True)
  49. return sorted_list
  50. async def get_content_oss_fission_dict(db_client, config, content_id) -> dict[str: float]:
  51. """
  52. 通过 content_id 对应的 oss 路径对应的裂变表现进行排序
  53. oss 数据每天凌晨 2 点更新
  54. :return:
  55. """
  56. FISSION_DETAIL_TABLE = config.fission_detail_table
  57. two_days_ago_dt = (datetime.now() - timedelta(days=2)).strftime('%Y%m%d')
  58. sql = f"""
  59. SELECT
  60. oss_name, fission_rate_0, fission_0_on_read
  61. FROM
  62. {FISSION_DETAIL_TABLE}
  63. WHERE content_id = '{content_id}' and dt >= '{two_days_ago_dt}'
  64. ORDER BY dt DESC;
  65. """
  66. result = await db_client.async_select(sql)
  67. fission_info_dict = {}
  68. if result:
  69. for item in result:
  70. key = item[0]
  71. value = {
  72. "fission_rate_0": item[1],
  73. "fission_0_on_read": item[2]
  74. }
  75. if fission_info_dict.get(key):
  76. continue
  77. else:
  78. fission_info_dict[key] = value
  79. return fission_info_dict
  80. else:
  81. return {}
  82. async def get_title_oss_fission_list(db_client, config, content_id) -> list[dict]:
  83. """
  84. 通过 content_id 对应的 oss 路径对应的裂变表现进行排序
  85. oss 数据每天凌晨 2 点更新
  86. todo: 获取有数据的最新dt
  87. :return:
  88. """
  89. FISSION_DETAIL_TABLE = config.fission_detail_table
  90. LONG_ARTICLES_TEXT_TABLE = config.article_text_table
  91. LONG_ARTICLES_CRAWLER_TABLE = config.article_crawler_video_table
  92. two_days_ago_dt = (datetime.now() - timedelta(days=2)).strftime('%Y%m%d')
  93. sql = f"""
  94. SELECT
  95. lavfi.oss_name, lavfi.fission_0_on_read, lacv.platform, lacv.cover_oss_path, lacv.user_id
  96. FROM
  97. {FISSION_DETAIL_TABLE} lavfi
  98. JOIN {LONG_ARTICLES_CRAWLER_TABLE} lacv on lavfi.oss_name = lacv.video_oss_path
  99. WHERE title = (
  100. SELECT article_title
  101. FROM {LONG_ARTICLES_TEXT_TABLE}
  102. WHERE content_id = '{content_id}'
  103. );
  104. AND lavfi.dt = (
  105. SELECT MAX(dt)
  106. FROM long_articles_videos_fission_info
  107. WHERE oss_name = lavfi.oss_name
  108. )
  109. """
  110. result = await db_client.async_select(sql)
  111. fission_info_list = []
  112. if result:
  113. for item in result:
  114. obj = {
  115. "platform": item[2],
  116. "video_oss_path": item[0],
  117. "cover_oss_path": item[3],
  118. "uid": item[4],
  119. "fission_0_on_read": item[1],
  120. }
  121. fission_info_list.append(obj)
  122. return sorted(fission_info_list, key=lambda x: x['fission_0_on_read'], reverse=True)
  123. else:
  124. return []