# coding: utf-8
"""Build per-user embeddings by averaging the embeddings of each user's videos.

Steps performed when the script runs:

1. Read the user-action CSV (one row per user/video interaction; the columns
   used here are ``userId`` and ``videoId``).
2. Read the video-embedding CSV (the columns used here are ``word``, holding
   the video id, and ``vector``, holding a JSON-encoded list of numbers).
3. Join the two tables on ``videoId == word``.
4. Decode every ``vector`` cell into a numpy array.
5. Average the vectors of each ``userId`` and write the result to CSV.
6. Print the elapsed wall-clock time.
"""
import json
import time

import numpy as np
import pandas as pd
# NOTE(review): F and T are not referenced anywhere in the visible script; the
# imports are kept in case other code relies on them.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Input/output files for the current data snapshot (tagged 20210419 / 210419).
# Earlier runs used the same naming scheme with other snapshot tags
# (1106, 1126, 1202, 1217, 210221/210222, 210406).
USER_ACTION_CSV = "/work/xielixun/user_action_feature_item2vec_app20210419.csv"
VIDEO_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_video_embedding-210419-sort.csv"
USER_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_videoids_users_embedding-210419-app.csv"

begin_time = time.time()

df = pd.read_csv(USER_ACTION_CSV)
df_embedding = pd.read_csv(VIDEO_EMBEDDING_CSV)

# pd.merge defaults to an inner join, so interactions whose videoId has no
# row in the embedding table are dropped here.
df_merge_user = pd.merge(left=df, right=df_embedding, left_on="videoId", right_on="word")

# Each "vector" cell is a JSON string; decode it into a numpy array so the
# vectors can be averaged numerically.
df_merge_user["vector"] = df_merge_user["vector"].map(lambda x: np.array(json.loads(x)))
print("df_merge_user[vector] is ")
print(df_merge_user["vector"])

# Despite the name, this holds the MEAN vector per user: np.mean is applied to
# each user's Series of arrays (assumes every vector has the same length --
# TODO confirm against the embedding file).
df_group_sum = df_merge_user.groupby(["userId"])["vector"].apply(np.mean).reset_index()
print("df_group_sum")

# NOTE(review): the "vector" column holds numpy arrays, so to_csv writes their
# string form rather than JSON -- confirm this is what downstream readers expect.
df_group_sum.to_csv(USER_EMBEDDING_CSV, index=False)

print("train cost time is: " + str(time.time() - begin_time))