# coding: utf-8
"""Build per-user embeddings from item2vec video embeddings.

The script reads a user-action CSV and a video-embedding CSV, joins them on
the video id (``videoId`` on the action side, ``word`` on the embedding
side), reduces each user's embedding vectors with ``np.mean``, and writes one
row per ``userId`` to an output CSV. The total wall-clock time is printed at
the end.
"""
import json
import time

import numpy as np
import pandas as pd
# NOTE(review): F and T are not referenced in the visible part of this
# script; the imports are retained in case other code relies on them.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Input/output locations for the current (2021-04-19) data snapshot.
USER_ACTION_CSV = "/work/xielixun/user_action_feature_item2vec_app20210419.csv"
VIDEO_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_video_embedding-210419-sort.csv"
USER_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_videoids_users_embedding-210419-app.csv"

begin_time = time.time()

df = pd.read_csv(USER_ACTION_CSV)
df_embedding = pd.read_csv(VIDEO_EMBEDDING_CSV)

# pd.merge defaults to an inner join, so action rows whose videoId has no
# matching "word" in the embedding table are dropped here.
df_merge_user = pd.merge(
    left=df,
    right=df_embedding,
    left_on="videoId",
    right_on="word",
)

# The "vector" column is stored as JSON text; decode every cell into a numpy
# array so the vectors can be combined numerically.
df_merge_user["vector"] = df_merge_user["vector"].map(
    lambda x: np.array(json.loads(x))
)
print("df_merge_user[vector] is ")
print(df_merge_user["vector"])

# Despite the "sum" in its name, this frame holds the result of np.mean over
# each user's vectors.
# NOTE(review): presumably np.mean over a group of equal-length arrays yields
# their element-wise mean -- confirm with the pandas version in use.
df_group_sum = (
    df_merge_user.groupby(["userId"])["vector"].apply(np.mean).reset_index()
)
print("df_group_sum")

# NOTE(review): the "vector" cells are not JSON-encoded before writing, so the
# CSV contains whatever string form pandas gives them -- confirm that the
# downstream reader expects that format.
df_group_sum.to_csv(USER_EMBEDDING_CSV, index=False)

print("train cost time is: " + str(time.time() - begin_time))