# preprocess_tzld210322_gen.py
  1. import random
  2. import gc
  3. import numpy as np
  4. import pandas as pd
  5. from tqdm import tqdm
  6. from tensorflow.python.keras.preprocessing.sequence import pad_sequences
  7. import time
  8. import threadpool
  9. from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED, as_completed
  10. import multiprocessing
  11. def gen_data_set_bak(data, negsample=0):
  12. data.sort_values("logtimestamp", inplace=True)
  13. label_list = list(data["label"])
  14. label_list_b = []
  15. item_ids = data['videoid'].unique()
  16. print(item_ids)
  17. user_ids = data['mid'].unique()
  18. print("\n\nuser size is: ", len(user_ids))
  19. print("********* pre data len is: " + str(len(list(data["label"]))))
  20. print(data["label"].mean())
  21. data = data[data["label"] > 0].copy()
  22. user_ids = data['mid'].unique()
  23. train_set = []
  24. test_set = []
  25. print("pre data len is: " + str(len(list(data["label"]))))
  26. for reviewerID, hist in tqdm(data.groupby('mid')):
  27. print("\n\nreviewerID : ", reviewerID)
  28. print("\n\nhist : \n", hist)
  29. # pos_list = hist['movie_id'].tolist()
  30. pos_list = hist['videoid'].tolist()
  31. rating_list = hist['rating'].tolist()
  32. if negsample > 0:
  33. candidate_set = list(set(item_ids) - set(pos_list))
  34. neg_list = np.random.choice(candidate_set, size=len(pos_list)*negsample,replace=True)
  35. for i in range(1, len(pos_list)):
  36. hist = pos_list[:i]
  37. if i != len(pos_list) - 1:
  38. train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]),rating_list[i]))
  39. # print("hist[::-1] is: ")
  40. # print(hist[::-1])
  41. for negi in range(negsample):
  42. train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1])))
  43. else:
  44. test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1]),rating_list[i]))
  45. random.shuffle(train_set)
  46. random.shuffle(test_set)
  47. print(len(train_set[0]), len(test_set[0]))
  48. print(len(train_set[0]))
  49. print(len(train_set), len(test_set))
  50. return train_set, test_set
  51. all_task = list()
  52. # pool = multiprocessing.Pool(processes=3)
  53. def thread_func1(train_set, test_set, negsample, item_ids, reviewerID, hist):
  54. begin_time = time.time()
  55. print("\n\nreviewerID : ", reviewerID)
  56. pos_list = hist['videoid'].tolist()
  57. rating_list = hist['rating'].tolist()
  58. if negsample > 0:
  59. candidate_set = list(set(item_ids) - set(pos_list))
  60. neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
  61. for i in range(1, len(pos_list)):
  62. hist = pos_list[:i]
  63. if i != len(pos_list) - 1:
  64. train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
  65. # print("hist[::-1] is: ")
  66. # print(hist[::-1])
  67. for negi in range(negsample):
  68. train_set.append((reviewerID, hist[::-1], neg_list[i * negsample + negi], 0, len(hist[::-1])))
  69. else:
  70. test_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
  71. print(str(reviewerID) + " idx process func cost time: " + str(time.time() - begin_time))
  72. return train_set, test_set
  73. def thread_func2(negsample, item_ids, reviewerID, hist):
  74. begin_time = time.time()
  75. train_set = []
  76. test_set = []
  77. print("\n\nreviewerID : ", reviewerID)
  78. pos_list = hist['videoid'].tolist()
  79. rating_list = hist['rating'].tolist()
  80. if negsample > 0:
  81. candidate_set = list(set(item_ids) - set(pos_list))
  82. neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
  83. for i in range(1, len(pos_list)):
  84. hist = pos_list[:i]
  85. print("hist[] is: ")
  86. print(hist)
  87. if i != len(pos_list) - 1:
  88. train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
  89. print("hist[::-1] is: ")
  90. print(hist[::-1])
  91. for negi in range(negsample):
  92. train_set.append((reviewerID, hist[::-1], neg_list[i * negsample + negi], 0, len(hist[::-1])))
  93. else:
  94. test_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
  95. print(str(reviewerID) + " idx process func cost time: " + str(time.time() - begin_time))
  96. return train_set, test_set
  97. def gen_data_set(train_path="", test_path="", user_mid_uid=None):
  98. train_set2 = pd.read_csv(train_path)
  99. test_set2 = pd.read_csv(test_path)
  100. train_set2 = train_set2[train_set2["mid"] != "unknown"].copy()
  101. train_pd = pd.merge(left=train_set2,
  102. right=user_mid_uid,
  103. left_on="mid",
  104. right_on="uidx",
  105. # how="left")
  106. how="inner")
  107. # train_pd = train_pd.sample(frac=0.2)
  108. test_pd = pd.merge(left=test_set2,
  109. right=user_mid_uid,
  110. left_on="mid",
  111. right_on="uidx",
  112. # how="left")
  113. how="inner")
  114. #train_pd = train_pd.sample(frac=0.3)
  115. train_pd = train_pd.sample(frac=0.6)
  116. print("after sample train set len is: ")
  117. print(len(train_pd), len(test_pd))
  118. return train_pd, test_pd
  119. def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50):
  120. data.sort_values("timestamp", inplace=True)
  121. train_set = []
  122. test_set = []
  123. for reviewerID, hist in tqdm(data.groupby('user_id')):
  124. pos_list = hist['movie_id'].tolist()
  125. genres_list = hist['genres'].tolist()
  126. rating_list = hist['rating'].tolist()
  127. for i in range(1, len(pos_list)):
  128. hist = pos_list[:i]
  129. genres_hist = genres_list[:i]
  130. if i <= seq_short_len and i != len(pos_list) - 1:
  131. train_set.append((reviewerID, hist[::-1], [0]*seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0,
  132. rating_list[i], genres_hist[::-1], [0]*seq_prefer_len))
  133. elif i != len(pos_list) - 1:
  134. train_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1, seq_short_len,
  135. len(hist[::-1])-seq_short_len, rating_list[i], genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:]))
  136. elif i <= seq_short_len and i == len(pos_list) - 1:
  137. test_set.append((reviewerID, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0,
  138. rating_list[i], genres_hist[::-1], [0]*seq_prefer_len))
  139. else:
  140. test_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1, seq_short_len,
  141. len(hist[::-1])-seq_short_len, rating_list[i], genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:]))
  142. random.shuffle(train_set)
  143. random.shuffle(test_set)
  144. print(len(train_set[0]), len(test_set[0]))
  145. return train_set, test_set
  146. def split(x):
  147. key_ans = x.split(' ')
  148. return list(map(int, key_ans))
  149. def gen_model_input(train_set, user_profile=None, item_profile=None, seq_max_len=50):
  150. train_uid = np.array(list(train_set["userid"].values))
  151. train_seq = list(map(split, train_set["hist_video_list"].values))
  152. train_iid = np.array(list(train_set["pos_video_id"].values))
  153. train_label = np.array(list(train_set["label"].values))
  154. train_hist_len = np.array(list(train_set["hist_list_len"].values))
  155. print("\n\n train_seq")
  156. print(type(train_seq))
  157. # 补充和截断 post从后面,默认pre从前面
  158. train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
  159. print("\n\n train_seq_pad")
  160. print(type(train_seq_pad))
  161. train_model_input = {"mid": train_uid, "videoid": train_iid, "hist_video_id": train_seq_pad,
  162. "hist_len": train_hist_len}
  163. for key in ["userRatedVideo1", "userGenre1", "userCity", "userRealplayCount",
  164. "videoGenre1", "authorid", "videoRealPlayCount", "videoDuration"]:
  165. try:
  166. train_model_input[key] = user_profile.loc[train_model_input['mid']][key].values
  167. except Exception as ex:
  168. print("\n\n gen_model_input exception ", ex)
  169. return None, None
  170. return train_model_input, train_label
  171. def gen_model_input_user_emb(train_set, user_profile=None, item_profile=None, seq_max_len=50):
  172. train_uid = np.array(list(train_set["userid"].values))
  173. train_seq = list(map(split, train_set["hist_video_list"].values))
  174. train_iid = np.array(list(train_set["pos_video_id"].values))
  175. train_label = np.array(list(train_set["label"].values))
  176. train_hist_len = np.array(list(train_set["hist_list_len"].values))
  177. # 补充和截断 post从后面,默认pre从前面
  178. train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
  179. train_model_input = {"mid": train_uid, "videoid": train_iid, "hist_video_id": train_seq_pad,
  180. "hist_len": train_hist_len}
  181. for key in ["userRatedVideo1", "userGenre1", "userCity", "userRealplayCount"]:
  182. try:
  183. train_model_input[key] = user_profile.loc[train_model_input['mid']][key].values
  184. except Exception as ex:
  185. print("\n\n gen_model_input_user_emb exception ", ex)
  186. return None, None
  187. return train_model_input, train_label