|
@@ -5,6 +5,7 @@ import datetime
|
|
|
import pandas as pd
|
|
|
from odps import ODPS
|
|
|
from utils import filter_video_status
|
|
|
+from db_helper import RedisHelper
|
|
|
from config import set_config
|
|
|
from log import Log
|
|
|
|
|
@@ -79,7 +80,7 @@ def cos_dist(vec1, vec2):
|
|
|
|
|
|
|
|
|
def get_movie_video_top_list():
|
|
|
- sql = "select videoid, title from videoods.movie_store_video_top_list;"
|
|
|
+ sql = "select videoid, title from videoods.movie_store_video_top_list where returns > 5;"
|
|
|
data_df = get_data_from_odps(project='videoods', sql=sql)
|
|
|
data_df = data_df.fillna(0)
|
|
|
data_df['videoid'] = data_df['videoid'].astype(int)
|
|
@@ -114,6 +115,7 @@ def get_sim_videos():
|
|
|
|
|
|
|
|
|
def similarity_rank(movie_videos, sim_videos):
|
|
|
+ redis_helper = RedisHelper()
|
|
|
sim_result = []
|
|
|
for video_id, title in movie_videos.items():
|
|
|
item_sim_list = []
|
|
@@ -128,22 +130,22 @@ def similarity_rank(movie_videos, sim_videos):
|
|
|
item_sim_list.append(item_sim)
|
|
|
item_sim_list.sort(key=lambda x: x['dist'], reverse=True)
|
|
|
sim_result.extend(item_sim_list[:4])
|
|
|
- # sim_result.append(item_sim)
|
|
|
+ # to_redis
|
|
|
+ key_name = f"{config_.MOVIE_RELEVANT_LIST_KEY_NAME_PREFIX}{video_id}"
|
|
|
+ relevant_data = dict()
|
|
|
+ for item in item_sim_list:
|
|
|
+ relevant_data[item['vid']] = item['dist']
|
|
|
+ if redis_helper.key_exists(key_name=key_name):
|
|
|
+ redis_helper.del_keys(key_name=key_name)
|
|
|
+ redis_helper.add_data_with_zset(key_name=key_name, data=relevant_data, expire_time=24*3600)
|
|
|
+
|
|
|
dist_df = pd.DataFrame(sim_result, columns=['top_video_id', 'title', 'vid', 'title1', 'dist'])
|
|
|
dist_df.to_csv('./data/videos_dist.csv', index=False)
|
|
|
- # sim_result[video_id] = item_sim
|
|
|
- # print(video_id, item_sim)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
- # str_list = ['S手的生活.2020', '花X道Z', '肉Y不能.法语中字', '窃YU无罪']
|
|
|
- # s1 = "杀手的生活"
|
|
|
- # for s2 in str_list:
|
|
|
- # vec1, vec2 = get_word_vector(s1, s2)
|
|
|
- # dist1 = cos_dist(vec1, vec2)
|
|
|
- # print(dist1)
|
|
|
movie_videos = get_movie_video_top_list()
|
|
|
sim_videos = get_sim_videos()
|
|
|
- print(len(movie_videos), len(sim_videos))
|
|
|
+ log_.info(f"movie_videos count = {len(movie_videos)}, sim_videos count = {len(sim_videos)}")
|
|
|
similarity_rank(movie_videos=movie_videos, sim_videos=sim_videos)
|
|
|
|