# preprocess_tzld210423_gen.py

import random
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import time
# threadpool / concurrent.futures / multiprocessing back the pool-based
# variants (thread_func1 / thread_func2); the pool setup itself is commented out below.
import threadpool
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED, as_completed
import multiprocessing


def gen_data_set_bak(data, negsample=0):
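    """Legacy in-memory sample generator.

    Keeps only positive interactions per user (ordered by logtimestamp), emits
    (mid, reversed_history, videoid, label, hist_len, rating) tuples, holds out
    each user's last interaction for the test set, and optionally draws
    `negsample` random negatives per positive.
    """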
    # data.sort_values("timestamp", inplace=True)
    data.sort_values("logtimestamp", inplace=True)
    label_list = list(data["label"])
    label_list_b = []
    item_ids = data['videoid'].unique()
    print(item_ids)
    user_ids = data['mid'].unique()
    print("\n\nuser size is: ", len(user_ids))
    print("********* pre data len is: " + str(len(list(data["label"]))))
    print(data["label"].mean())
    # Keep positive interactions only.
    data = data[data["label"] > 0].copy()
    user_ids = data['mid'].unique()
    print("******** post user size is: ", len(user_ids))
    print("data len is: " + str(len(list(data["label"]))))
    train_set = []
    test_set = []
    print("\n\n ******* data is: ")
    # print(data)
    # print(list(data.groupby('mid')))
    print("pre data len is: " + str(len(list(data["label"]))))
    for reviewerID, hist in tqdm(data.groupby('mid')):
        print("\n\nreviewerID : ", reviewerID)
        print("\n\nhist : \n", hist)
        # pos_list = hist['movie_id'].tolist()
        pos_list = hist['videoid'].tolist()
        rating_list = hist['rating'].tolist()
        if negsample > 0:
            candidate_set = list(set(item_ids) - set(pos_list))
            neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            if i != len(pos_list) - 1:
                train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
                # print("hist[::-1] is: ")
                # print(hist[::-1])
                for negi in range(negsample):
                    # Note: negative tuples carry 5 fields (no rating); positives carry 6.
                    train_set.append((reviewerID, hist[::-1], neg_list[i * negsample + negi], 0, len(hist[::-1])))
            else:
                # Each user's most recent interaction is held out for testing.
                test_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
    random.shuffle(train_set)
    random.shuffle(test_set)
    print(len(train_set[0]), len(test_set[0]))
    print(len(train_set[0]))
    print(len(train_set), len(test_set))
    return train_set, test_set


all_task = list()
# pool = multiprocessing.Pool(processes=3)


def thread_func1(train_set, test_set, negsample, item_ids, reviewerID, hist):
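    """Per-user worker for a thread pool: same sampling as gen_data_set_bak,
    but appends into the shared train_set/test_set lists passed in by the caller.
    """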
    begin_time = time.time()
    # train_set = []
    # test_set = []
    print("\n\nreviewerID : ", reviewerID)
    # hist = value_index_dict[reviewerID]
    # print("\n\nhist : \n", hist)
    # pos_list = hist['movie_id'].tolist()
    pos_list = hist['videoid'].tolist()
    rating_list = hist['rating'].tolist()
    if negsample > 0:
        candidate_set = list(set(item_ids) - set(pos_list))
        neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
    for i in range(1, len(pos_list)):
        hist = pos_list[:i]
        if i != len(pos_list) - 1:
            train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
            # print("hist[::-1] is: ")
            # print(hist[::-1])
            for negi in range(negsample):
                train_set.append((reviewerID, hist[::-1], neg_list[i * negsample + negi], 0, len(hist[::-1])))
        else:
            test_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
    print(str(reviewerID) + " idx process func cost time: " + str(time.time() - begin_time))
    return train_set, test_set


def thread_func2(negsample, item_ids, reviewerID, hist):
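    """Variant of thread_func1 that builds local train/test lists and returns
    them, so per-user results can be merged after the pool completes instead
    of mutating shared state.
    """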
    begin_time = time.time()
    train_set = []
    test_set = []
    print("\n\nreviewerID : ", reviewerID)
    # hist = value_index_dict[reviewerID]
    # print("\n\nhist : \n", hist)
    # pos_list = hist['movie_id'].tolist()
    pos_list = hist['videoid'].tolist()
    print("pos_list is: ")
    print(pos_list)
    rating_list = hist['rating'].tolist()
    if negsample > 0:
        candidate_set = list(set(item_ids) - set(pos_list))
        neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
    for i in range(1, len(pos_list)):
        hist = pos_list[:i]
        print("hist[] is: ")
        print(hist)
        if i != len(pos_list) - 1:
            train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
            print("hist[::-1] is: ")
            print(hist[::-1])
            for negi in range(negsample):
                train_set.append((reviewerID, hist[::-1], neg_list[i * negsample + negi], 0, len(hist[::-1])))
        else:
            test_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]), rating_list[i]))
    print(str(reviewerID) + " idx process func cost time: " + str(time.time() - begin_time))
    return train_set, test_set


# def gen_data_set(data, negsample=0, train_path="", test_path="", user_mid_uid=None):
# def gen_data_set(train_path="", test_path=""):
def gen_data_set(train_path="", test_path="", user_mid_uid=None):
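    """Load pre-generated train/test sample CSVs, drop rows whose mid is
    "unknown", inner-join the mid -> uid mapping (user_mid_uid, keyed by
    "uidx"), and shuffle the training rows.
    """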
    # train_set2 = []
    # test_set2 = []
    train_set2 = pd.read_csv(train_path)
    test_set2 = pd.read_csv(test_path)
    train_set2 = train_set2[train_set2["mid"] != "unknown"].copy()
    test_set2 = test_set2[test_set2["mid"] != "unknown"].copy()
    print("train set len is: ")
    print(len(train_set2))
    print("user mid uid len is: ", len(user_mid_uid))
    train_pd = pd.merge(left=train_set2,
                        right=user_mid_uid,
                        left_on="mid",
                        right_on="uidx",
                        # how="left")
                        how="inner")
    # train_pd = train_pd.sample(frac=0.2)
    test_pd = pd.merge(left=test_set2,
                       right=user_mid_uid,
                       left_on="mid",
                       right_on="uidx",
                       # how="left")
                       how="inner")
    # frac=1.0 keeps every row but shuffles their order.
    train_pd = train_pd.sample(frac=1.0)
    print("after sample train set len is: ")
    print(len(train_pd), len(test_pd))
    print(len(train_set2), len(test_set2))
    del train_set2, test_set2
    gc.collect()
    return train_pd, test_pd


def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50):
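    """Build SDM-style samples: each example carries a short-term session
    (up to seq_short_len most recent items, reversed) and a long-term
    preference sequence (the remainder, padded to seq_prefer_len), for both
    movie ids and genres; the last interaction per user goes to the test set.
    """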
    data.sort_values("timestamp", inplace=True)
    train_set = []
    test_set = []
    for reviewerID, hist in tqdm(data.groupby('user_id')):
        pos_list = hist['movie_id'].tolist()
        genres_list = hist['genres'].tolist()
        rating_list = hist['rating'].tolist()
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            genres_hist = genres_list[:i]
            if i <= seq_short_len and i != len(pos_list) - 1:
                train_set.append((reviewerID, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0,
                                  rating_list[i], genres_hist[::-1], [0] * seq_prefer_len))
            elif i != len(pos_list) - 1:
                train_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
                                  seq_short_len, len(hist[::-1]) - seq_short_len, rating_list[i],
                                  genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:]))
            elif i <= seq_short_len and i == len(pos_list) - 1:
                test_set.append((reviewerID, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1, len(hist[::-1]), 0,
                                 rating_list[i], genres_hist[::-1], [0] * seq_prefer_len))
            else:
                test_set.append((reviewerID, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
                                 seq_short_len, len(hist[::-1]) - seq_short_len, rating_list[i],
                                 genres_hist[::-1][:seq_short_len], genres_hist[::-1][seq_short_len:]))
    random.shuffle(train_set)
    random.shuffle(test_set)
    print(len(train_set[0]), len(test_set[0]))
    return train_set, test_set


def split(x):
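    """Parse a space-separated id string into a list of ints."""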
    key_ans = x.split(' ')
    return list(map(int, key_ans))


def gen_model_input(train_set, user_profile=None, item_profile=None, seq_max_len=50):
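    """Turn a sample DataFrame into the feed dict the matching model expects:
    pads the watch-history sequence and attaches user/video profile features
    looked up from user_profile (indexed by mid). item_profile is unused here.
    """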
    train_uid = np.array(list(train_set["userid"].values))
    train_uid_org = np.array(list(train_set["mid"].values))
    train_seq = list(map(split, train_set["hist_video_list"].values))
    train_iid = np.array(list(train_set["pos_video_id"].values))
    train_label = np.array(list(train_set["label"].values))
    train_hist_len = np.array(list(train_set["hist_list_len"].values))
    print("\n\n train_seq")
    print(type(train_seq))
    # print(train_seq)
    # Pad and truncate: 'post' works at the end of a sequence; the default 'pre' works at the front.
    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
    print("\n\n train_seq_pad")
    print(type(train_seq_pad))
    train_model_input = {"mid": train_uid, "videoid": train_iid, "hist_video_id": train_seq_pad,
                         "hist_len": train_hist_len, "userid_org": train_uid_org}
    for key in ["mobileos", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2",
                "userCity", "userRealplayCount", "videoGenre1", "videoGenre2", "authorid", "videoRealPlayCount",
                "videoDuration", "videorealplayrate"]:
        try:
            # train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values
            train_model_input[key] = user_profile.loc[train_model_input['mid']][key].values
        except Exception as ex:
            print("\n\n gen_model_input exception ", ex)
            return None, None
    return train_model_input, train_label


def gen_model_input_user_emb(train_set, user_profile=None, item_profile=None, seq_max_len=50):
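    """Same as gen_model_input, but attaches only the user-side profile
    features (used when exporting user embeddings).
    """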
    train_uid = np.array(list(train_set["userid"].values))
    train_uid_org = np.array(list(train_set["mid"].values))
    train_seq = list(map(split, train_set["hist_video_list"].values))
    train_iid = np.array(list(train_set["pos_video_id"].values))
    train_label = np.array(list(train_set["label"].values))
    train_hist_len = np.array(list(train_set["hist_list_len"].values))
    print("\n\n train_seq")
    print(type(train_seq))
    # print(train_seq)
    # Pad and truncate: 'post' works at the end of a sequence; the default 'pre' works at the front.
    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
    train_model_input = {"mid": train_uid, "videoid": train_iid, "hist_video_id": train_seq_pad,
                         "hist_len": train_hist_len, "userid_org": train_uid_org}
    for key in ["mobileos", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2",
                "userCity", "userRealplayCount"]:
        # "videoGenre1", "authorid", "videoRealPlayCount", "videoDuration"]:
        try:
            train_model_input[key] = user_profile.loc[train_model_input['mid']][key].values
        except Exception as ex:
            print("\n\n gen_model_input_user_emb exception ", ex)
            return None, None
    return train_model_input, train_label


def gen_model_input_sdm(train_set, user_profile, seq_short_len, seq_prefer_len):
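    """Unpack gen_data_set_sdm tuples into the SDM feed dict: pads the short
    and preference sequences for items and genres and attaches user profile
    features. Tuple field 7 (rating) is not used here.
    """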
    train_uid = np.array([line[0] for line in train_set])
    short_train_seq = [line[1] for line in train_set]
    prefer_train_seq = [line[2] for line in train_set]
    train_iid = np.array([line[3] for line in train_set])
    train_label = np.array([line[4] for line in train_set])
    train_short_len = np.array([line[5] for line in train_set])
    train_prefer_len = np.array([line[6] for line in train_set])
    short_train_seq_genres = np.array([line[8] for line in train_set])
    prefer_train_seq_genres = np.array([line[9] for line in train_set])
    train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_len, padding='post', truncating='post',
                                         value=0)
    train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_len, padding='post', truncating='post',
                                          value=0)
    train_short_genres_pad = pad_sequences(short_train_seq_genres, maxlen=seq_short_len, padding='post',
                                           truncating='post', value=0)
    train_prefer_genres_pad = pad_sequences(prefer_train_seq_genres, maxlen=seq_prefer_len, padding='post',
                                            truncating='post', value=0)
    train_model_input = {"user_id": train_uid, "movie_id": train_iid, "short_movie_id": train_short_item_pad,
                         "prefer_movie_id": train_prefer_item_pad, "prefer_sess_length": train_prefer_len,
                         "short_sess_length": train_short_len, 'short_genres': train_short_genres_pad,
                         'prefer_genres': train_prefer_genres_pad}
    for key in ["gender", "age", "occupation", "zip"]:
        train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values
    return train_model_input, train_label
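

# Hedged usage sketch (illustrative only, not part of the original pipeline):
# exercises the in-memory generator gen_data_set_bak on a tiny made-up
# interaction log. The toy column values below are assumptions for
# demonstration; the real job reads mid/videoid/label/rating/logtimestamp
# from upstream logs.
if __name__ == "__main__":
    toy_log = pd.DataFrame({
        "mid": ["u1", "u1", "u1", "u1", "u2", "u2", "u2"],
        "videoid": [101, 102, 103, 104, 201, 202, 203],
        "label": [1, 1, 1, 1, 1, 1, 1],
        "rating": [5, 4, 3, 5, 4, 4, 2],
        "logtimestamp": [1, 2, 3, 4, 1, 2, 3],
    })
    # One random negative per positive; each user's last watch becomes test data.
    toy_train, toy_test = gen_data_set_bak(toy_log, negsample=1)
    print("toy train/test sizes:", len(toy_train), len(toy_test))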