# item2Vec-userEmbedding-tzld.py (2.4 KB)
  # coding: utf-8
"""Build per-user embeddings by averaging item2vec video embeddings.

The script reads a user-action CSV and a video-embedding CSV, joins them on
the video id, averages the embedding vectors of the videos attached to each
user, and writes one row per user to an output CSV.
"""
import json
import time

import numpy as np
import pandas as pd
# NOTE(review): F and T are not referenced in the visible part of this script;
# the imports are kept in case code outside this view relies on them.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Dated snapshot files. Earlier runs pointed these at older snapshots
# (e.g. 20210406 / 210406); update the three paths together for a new export.
USER_ACTIONS_CSV = "/work/xielixun/user_action_feature_item2vec_app20210419.csv"
VIDEO_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_video_embedding-210419-sort.csv"
USER_EMBEDDING_CSV = "/work/xielixun/item2vec-java/tzld_videoids_users_embedding-210419-app.csv"


def _mean_vector(vectors):
    """Return the element-wise mean of one group's embedding vectors.

    Args:
        vectors: pandas Series whose elements are equal-length numpy arrays.

    Returns:
        A numpy array holding the per-dimension average of the group.
    """
    return np.mean(np.stack(vectors.tolist()), axis=0)


begin_time = time.time()

# The join below requires `videoId` here; the grouping requires `userId`.
df = pd.read_csv(USER_ACTIONS_CSV)

# `word` is the key matched against `videoId`; `vector` is parsed as JSON below.
df_embedding = pd.read_csv(VIDEO_EMBEDDING_CSV)

# pd.merge defaults to an inner join, so rows whose videoId has no matching
# `word` in the embedding table are dropped here.
df_merge_user = pd.merge(
    left=df,
    right=df_embedding,
    left_on="videoId",
    right_on="word",
)

# Decode each JSON-encoded embedding into a numpy array so it can be averaged.
df_merge_user["vector"] = df_merge_user["vector"].map(
    lambda raw: np.array(json.loads(raw))
)
print("df_merge_user[vector] is ")
print(df_merge_user["vector"])

# Despite the name, this holds the per-user MEAN vector; the name is kept so
# any code that refers to `df_group_sum` keeps working.
df_group_sum = (
    df_merge_user.groupby(["userId"])["vector"]
    .apply(_mean_vector)
    .reset_index()
)
print("df_group_sum")

# NOTE(review): the `vector` column holds numpy arrays, so the CSV presumably
# contains their str() form rather than JSON - confirm downstream readers
# expect that format before changing it.
df_group_sum.to_csv(USER_EMBEDDING_CSV, index=False)
print(f"train cost time is: {time.time() - begin_time}")