videos_similarity.py

import jieba
import numpy as np
import datetime
import pandas as pd
from odps import ODPS
from my_utils import filter_video_status
from db_helper import RedisHelper
from my_config import set_config
from log import Log

config_, _ = set_config()
log_ = Log()

def get_data_from_odps(project, sql):
    """Run a SQL query on ODPS and return the result as a DataFrame, or None on failure."""
    odps = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000
    )
    try:
        with odps.execute_sql(sql=sql).open_reader() as reader:
            data_df = reader.to_pandas()
    except Exception as e:
        log_.error(f"ODPS query failed: {e}")
        data_df = None
    return data_df

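# Minimal usage sketch (illustrative only): the function returns None when the
# query fails, so callers should guard before touching the DataFrame. The SQL
# below is a placeholder, not a real table reference.
#
#     df = get_data_from_odps(project='videoods', sql='select 1;')
#     if df is not None:
#         print(df.head())
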
def get_word_vector(s1, s2):
    """
    :param s1: sentence 1
    :param s2: sentence 2
    :return: the term-frequency vectors of the two sentences over their shared vocabulary
    """
    # Tokenize (jieba.lcut already returns a list of tokens, so no join/split round trip is needed)
    list_word1 = jieba.lcut(s1, cut_all=False)
    list_word2 = jieba.lcut(s2, cut_all=False)
    # Vocabulary: the union of the words in both sentences
    key_word = list(set(list_word1 + list_word2))
    # Zero-filled arrays to hold the vectors, one dimension per vocabulary word
    word_vector1 = np.zeros(len(key_word))
    word_vector2 = np.zeros(len(key_word))
    # Term frequency: count how often each vocabulary word occurs in each sentence
    for i, word in enumerate(key_word):
        word_vector1[i] = list_word1.count(word)
        word_vector2[i] = list_word2.count(word)
    return word_vector1, word_vector2

def cos_dist(vec1, vec2):
    """
    :param vec1: vector 1
    :param vec2: vector 2
    :return: the cosine similarity of the two vectors
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # An all-zero vector (e.g. from an empty title) has no defined direction
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)

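# Illustrative sketch (not part of the pipeline): how the two helpers above
# combine into a bag-of-words cosine similarity over jieba tokens. The titles
# are made-up examples.
#
#     vec1, vec2 = get_word_vector('速度与激情7', '速度与激情9')
#     score = cos_dist(vec1, vec2)  # in [0, 1]; 0 means no shared words
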
def get_movie_video_top_list():
    # Top-list movie videos with more than 5 returns
    sql = "select videoid, title from videoods.movie_store_video_top_list where returns > 5;"
    data_df = get_data_from_odps(project='videoods', sql=sql)
    if data_df is None:
        return dict()
    data_df = data_df.fillna(0)
    data_df['videoid'] = data_df['videoid'].astype(int)
    movie_videos = dict()
    for index, row in data_df.iterrows():
        if row['videoid'] == 0:
            continue
        movie_videos[int(row['videoid'])] = row['title']
    return movie_videos

def get_sim_videos():
    now = datetime.datetime.now()
    log_.info(f"now = {datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')}")
    # Only consider videos created in the last 30 days, but no earlier than the allow-list cutoff
    sql_create_time = datetime.datetime.strftime(now - datetime.timedelta(days=30), '%Y-%m-%d %H:%M:%S')
    if sql_create_time < '2022-04-22 16:40:00':
        sql_create_time = '2022-04-22 16:40:00'
    sql = f"SELECT video_id, create_time, title FROM videoods.movie_store_video_allow_list_final " \
          f"WHERE create_time>='{sql_create_time}';"
    data_df = get_data_from_odps(project='videoods', sql=sql)
    if data_df is None:
        return dict()
    video_ids = [int(video_id) for video_id in data_df['video_id'].to_list()]
    # Filter by video status, keeping only recommendable videos
    filtered_videos = filter_video_status(list(video_ids))
    sim_videos = dict()
    for index, row in data_df.iterrows():
        video_id = int(row['video_id'])
        if video_id in filtered_videos:
            sim_videos[video_id] = row['title']
    return sim_videos

def similarity_rank(movie_videos, sim_videos):
    redis_helper = RedisHelper()
    sim_result = []
    if len(movie_videos) == 0 or len(sim_videos) == 0:
        return
    for video_id, title in movie_videos.items():
        # Score every candidate title against this top-list title
        item_sim_list = []
        for vid, title1 in sim_videos.items():
            if vid == video_id:
                continue
            vec1, vec2 = get_word_vector(title, title1)
            dist = cos_dist(vec1, vec2)
            if dist > 0:
                item_sim = {'top_video_id': video_id, 'title': title, 'vid': vid, 'title1': title1, 'dist': dist}
                item_sim_list.append(item_sim)
        # Keep only the top SIM_N_19 most similar candidates
        item_sim_list.sort(key=lambda x: x['dist'], reverse=True)
        sim_result.extend(item_sim_list[:config_.SIM_N_19])
        # Write the ranked candidates to Redis as a sorted set keyed by the top-list video id
        key_name = f"{config_.MOVIE_RELEVANT_LIST_KEY_NAME_PREFIX}{video_id}"
        relevant_data = dict()
        for item in item_sim_list[:config_.SIM_N_19]:
            relevant_data[item['vid']] = item['dist']
        if redis_helper.key_exists(key_name=key_name):
            redis_helper.del_keys(key_name=key_name)
        if relevant_data:
            log_.info(f"video_id = {video_id}")
            redis_helper.add_data_with_zset(key_name=key_name, data=relevant_data, expire_time=10*60)
    dist_df = pd.DataFrame(sim_result, columns=['top_video_id', 'title', 'vid', 'title1', 'dist'])
    dist_df.to_csv('./data/videos_dist.csv', index=False)

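# Consumer-side sketch, assuming plain redis-py (RedisHelper's read API is not
# shown in this file, so this is an assumption, not the project's actual helper):
# read back the sorted set written above, most similar candidates first.
#
#     import redis
#     r = redis.Redis()  # connection parameters are assumptions
#     key = f"{config_.MOVIE_RELEVANT_LIST_KEY_NAME_PREFIX}{video_id}"
#     relevant = r.zrevrange(key, 0, -1, withscores=True)
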
if __name__ == '__main__':
    movie_videos = get_movie_video_top_list()
    sim_videos = get_sim_videos()
    log_.info(f"movie_videos count = {len(movie_videos)}, sim_videos count = {len(sim_videos)}")
    similarity_rank(movie_videos=movie_videos, sim_videos=sim_videos)