mind_tzld_match.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. # encoding: utf-8
  2. import pandas as pd
  3. import os
  4. import gc
  5. from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
  6. # from preprocess_tzld210223 import gen_data_set, gen_model_input, gen_model_input_user_emb
  7. # from preprocess_tzld210303 import gen_data_set, gen_model_input, gen_model_input_user_emb
  8. # from preprocess_tzld210315 import gen_data_set, gen_model_input, gen_model_input_user_emb
  9. from preprocess_tzld210322_gen import gen_data_set, gen_model_input, gen_model_input_user_emb
  10. from sklearn.preprocessing import LabelEncoder
  11. from tensorflow.python.keras import backend as K
  12. from tensorflow.python.keras.models import Model
  13. import tensorflow as tf
  14. import numpy as np
  15. from deepmatch.models import *
  16. from deepmatch.utils import sampledsoftmaxloss
  17. count_train = 1
  18. count_test = 1
  19. batch_size = 1024
  20. def generate_arrays_from_train(train_set, user_profile, SEQ_LEN):
  21. # x_y 是我们的训练集包括标签,每一行的第一个是我们的图片路径,后面的是我们的独热化后的标签
  22. # global count
  23. # batch_size = 8
  24. while 1:
  25. for i in range(0, len(train_set), batch_size):
  26. try:
  27. train_batch = train_set[i: i + batch_size]
  28. train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
  29. if train_model_input_batch is None or train_label_batch is None:
  30. continue
  31. print("train i: " + str(i) + " len train set " + str(len(train_set)))
  32. print(train_model_input_batch)
  33. print(train_label_batch)
  34. yield (train_model_input_batch, train_label_batch)
  35. except Exception as ex:
  36. print("\n\n generate_arrays_from_train exception ", ex)
  37. continue
  38. def generate_arrays_from_train_bak(train_set, user_profile, SEQ_LEN):
  39. global count_train
  40. # batch_size = 8
  41. while True:
  42. try:
  43. train_batch = train_set[(count_train - 1) * batch_size: count_train * batch_size]
  44. train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
  45. if count_train % 1000 == 0:
  46. print("count:" + str(count_train) + " len train set " + str(len(train_set)))
  47. count_train = count_train + 1
  48. if count_train * batch_size > len(train_set):
  49. count_train = 1
  50. yield (train_model_input_batch, train_label_batch)
  51. except Exception as ex:
  52. print("\n\n generate_arrays_from_file exception ", ex)
  53. count_train = count_train + 1
  54. continue
  55. def generate_arrays_from_test(train_set, user_profile, SEQ_LEN):
  56. # x_y 是我们的训练集包括标签,每一行的第一个是我们的图片路径,后面的是我们的独热化后的标签
  57. # global count
  58. # batch_size = 8
  59. while 1:
  60. for i in range(0, len(train_set), batch_size):
  61. try:
  62. train_batch = train_set[i: i + batch_size]
  63. # train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
  64. train_model_input_batch, train_label_batch = gen_model_input_user_emb(train_batch, user_profile, SEQ_LEN)
  65. if train_model_input_batch is None or train_label_batch is None:
  66. continue
  67. print("test i: " + str(i) + " len train set " + str(len(train_set)))
  68. yield (train_model_input_batch, train_label_batch)
  69. except Exception as ex:
  70. print("\n\n generate_arrays_from_test exception ", ex)
  71. continue
  72. def generate_arrays_from_test_bak(train_set, user_profile, SEQ_LEN):
  73. global count_test
  74. # batch_size = 8
  75. while True:
  76. try:
  77. train_batch = train_set[(count_test - 1) * batch_size: count_test * batch_size]
  78. train_model_input_batch, train_label_batch = gen_model_input_user_emb(train_batch, user_profile, SEQ_LEN)
  79. if count_test % 1000 == 0:
  80. print("count:" + str(count_test) + " len train set " + str(len(train_set)))
  81. count_test = count_test + 1
  82. if count_test * batch_size > len(train_set):
  83. count_test = 1
  84. yield (train_model_input_batch, train_label_batch)
  85. except Exception as ex:
  86. print("\n\n generate_arrays_from_file exception ", ex)
  87. count_test = count_test + 1
  88. continue
  89. if __name__ == "__main__":
  90. data = pd.read_csvdata = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
  91. print(data[0:5])
  92. sparse_features = ["videoid", "mid",
  93. "videoGenre1", "userRatedVideo1", "userGenre1", "userCity",
  94. "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
  95. SEQ_LEN = 50
  96. #negsample = 3
  97. negsample = 0
  98. features = ["videoid", "mid",
  99. "videoGenre1", "userRatedVideo1", "userGenre1", "userCity",
  100. "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
  101. feature_max_idx = {}
  102. data["mid"].replace("unknown", "N000111111D", inplace=True)
  103. data = data[data["mid"] != "unknown"].copy()
  104. data = data[data["mid"] != "N000111111D"].copy()
  105. # 和上面函数的功能是一样的,见 deepMatch DSSM
  106. def add_index_column(param_df, column_name):
  107. values = list(param_df[column_name].unique())
  108. value_index_dict = {value: idx for idx, value in enumerate(values)}
  109. if column_name == "mid":
  110. param_df["uidx"] = param_df[column_name].copy()
  111. param_df["mid"] = param_df[column_name].map(value_index_dict)
  112. feature_max_idx["mid"] = param_df["mid"].max() + 1
  113. add_index_column(data, "mid")
  114. feature_max_idx["videoid"] = data["videoid"].max() + 1
  115. for idx, column_name in enumerate(features):
  116. lbe = LabelEncoder()
  117. if column_name == "videoGenre1" or column_name == "videoGenre2" or \
  118. column_name == "videoGenre3" or column_name == "userGenre1" or column_name == "userGenre2" or column_name == "userGenre3":
  119. data[column_name].fillna("社会", inplace=True)
  120. if column_name == "userCity":
  121. data[column_name].fillna("北京", inplace=True)
  122. if column_name == "mid":
  123. continue
  124. data["uidx"] = data[column_name].copy()
  125. data["mid"] = lbe.fit_transform(data[column_name])
  126. feature_max_idx["mid"] = data["mid"].max() + 1
  127. elif column_name == "videoid": # 负采样生成的videoid,没有离散化
  128. continue
  129. data["vidx"] = data[column_name].copy()
  130. data["videoid"] = lbe.fit_transform(data[column_name])
  131. feature_max_idx["videoid"] = data["videoid"].max() + 1
  132. else:
  133. data[column_name] = lbe.fit_transform(data[column_name]) + 1
  134. feature_max_idx[column_name] = data[column_name].max() + 1
  135. user_profile = data[["uidx", "mid", "userRatedVideo1", "userGenre1", "userCity", "userRealplayCount",
  136. "videoGenre1", "authorid", "videoRealPlayCount", "videoDuration"]].drop_duplicates('mid')
  137. user_mid_uid = data[["uidx", "mid"]].drop_duplicates('mid')
  138. user_mid_uid.rename(columns={'mid': 'userid'}, inplace=True)
  139. item_profile = data[
  140. ["videoid", "videoGenre1", "authorid", "videoRealPlayCount", "videoDuration"]].drop_duplicates(
  141. 'videoid')
  142. print("item size is: ", len(item_profile))
  143. user_profile.set_index("mid", inplace=True)
  144. del data
  145. gc.collect()
  146. #test_path = "/work/xielixun/test_user_video_play_test.csv"
  147. #train_path = "/work/xielixun/train_user_video_play.csv"
  148. test_path = "/work/xielixun/test_user_video_play_test_sample_negtive.csv"
  149. train_path = "/work/xielixun/train_user_video_play_sample_negtive.csv"
  150. train_set, test_set = gen_data_set(train_path, test_path, user_mid_uid)
  151. embedding_dim = 16
  152. user_feature_columns = [SparseFeat('mid', feature_max_idx['mid'], embedding_dim),
  153. SparseFeat("userRatedVideo1", feature_max_idx['userRatedVideo1'], embedding_dim),
  154. SparseFeat("userGenre1", feature_max_idx['userGenre1'], embedding_dim),
  155. SparseFeat("userCity", feature_max_idx['userCity'], embedding_dim),
  156. SparseFeat("userRealplayCount", feature_max_idx['userRealplayCount'], embedding_dim),
  157. VarLenSparseFeat(SparseFeat('hist_video_id', feature_max_idx['videoid'], embedding_dim,
  158. embedding_name="videoid"), SEQ_LEN, 'mean', 'hist_len'),
  159. ]
  160. item_feature_columns = [SparseFeat('video_id', feature_max_idx['videoid'], embedding_dim),
  161. SparseFeat("videoGenre1", feature_max_idx["videoGenre1"], embedding_dim),
  162. SparseFeat("authorid", feature_max_idx["authorid"], embedding_dim),
  163. SparseFeat("videoRealPlayCount", feature_max_idx["videoRealPlayCount"], embedding_dim),
  164. SparseFeat("videoDuration", feature_max_idx["videoDuration"], embedding_dim)]
  165. feature_names = get_feature_names(user_feature_columns + item_feature_columns)
  166. # 因为下面这几行没有加,导致了错误 tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed: [predictions must be <= 1] [Condition x <= y did not hold element-wise:] [x (functional_1/sampled_softmax_layer/ExpandDims:0) = ] [[0.944198132][1.15184534][1.00592339]...] [y (Cast_4/x:0) = ] [1]
  167. K.set_learning_phase(True)
  168. import tensorflow as tf
  169. if tf.__version__ >= '2.0.0':
  170. tf.compat.v1.disable_eager_execution()
  171. model = MIND(user_feature_columns, item_feature_columns, dynamic_k=False, p=1, k_max=2, num_sampled=5,user_dnn_hidden_units=(64, embedding_dim))
  172. logdir = os.path.join("log_callbacks_mind") # Tensorboard需要一个文件夹
  173. if not os.path.exists(logdir):
  174. os.mkdir(logdir)
  175. output_model_file = os.path.join(logdir,
  176. 'mind_model.h5')
  177. callbacks = [
  178. tf.keras.callbacks.TensorBoard(logdir),
  179. tf.keras.callbacks.ModelCheckpoint(output_model_file,
  180. save_best_only=True),
  181. tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-5),
  182. ]
  183. METRICS = [
  184. tf.keras.metrics.TruePositives(name='tp'),
  185. tf.keras.metrics.FalsePositives(name='fp'),
  186. tf.keras.metrics.TrueNegatives(name='tn'),
  187. tf.keras.metrics.FalseNegatives(name='fn'),
  188. tf.keras.metrics.BinaryAccuracy(name='accuracy'),
  189. tf.keras.metrics.Precision(name='precision'),
  190. tf.keras.metrics.Recall(name='recall'),
  191. tf.keras.metrics.AUC(name='auc'),
  192. tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
  193. tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
  194. ]
  195. model.compile(
  196. loss='binary_crossentropy',
  197. #loss=sampledsoftmaxloss,
  198. optimizer='adam',
  199. metrics=METRICS
  200. )
  201. model.fit_generator(generate_arrays_from_train(train_set, user_profile, SEQ_LEN),
  202. steps_per_epoch=len(train_set) // batch_size,
  203. #epochs=2, max_queue_size=10, workers=1,
  204. epochs=2, max_queue_size=1, workers=1,
  205. callbacks=callbacks, verbose=1, use_multiprocessing=False)
  206. model.save("./tensorflow_mind-0327-tzld-1.h5")
  207. tf.keras.models.save_model(model,
  208. "file:///work/xielixun/DeepMatch/youtubeDNN/tensorflow_mind-0327-tzld/001",
  209. overwrite=True,
  210. include_optimizer=True,
  211. save_format=None,
  212. signatures=None,
  213. options=None)
  214. all_item_model_input = {"video_id": item_profile['videoid'].values,
  215. "videoGenre1": item_profile['videoGenre1'].values,
  216. "authorid": item_profile['authorid'].values,
  217. "videoRealPlayCount": item_profile['videoRealPlayCount'].values,
  218. "videoDuration": item_profile['videoDuration'].values}
  219. user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
  220. item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
  221. user_embs = user_embedding_model.predict_generator(generate_arrays_from_test(test_set, user_profile, SEQ_LEN),
  222. steps=len(test_set) // batch_size,
  223. max_queue_size=10, workers=1, callbacks=callbacks,
  224. verbose=1, use_multiprocessing=False)
  225. item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
  226. # #### 得到user embedding
  227. user_layer_model = tf.keras.models.Model(
  228. inputs=[model.user_input],
  229. # outputs=model.get_layer("user_embedding").output
  230. outputs=model.user_embedding
  231. )
  232. user_embeddings = []
  233. # #### 得到video embedding
  234. video_layer_model = tf.keras.models.Model(
  235. inputs=[model.item_input],
  236. # outputs=model.get_layer("item_embedding").output
  237. outputs=model.item_embedding
  238. )
  239. video_embeddings = []
  240. for index, row in item_profile.iterrows():
  241. # video_id = row["vidx"]
  242. video_id = row["videoid"]
  243. video_input = [
  244. np.reshape(row["videoid"], [1, 1]),
  245. np.reshape(row["videoGenre1"], [1, 1]),
  246. # np.reshape(row["videoGenre2"], [1, 1]),
  247. np.reshape(row["authorid"], [1, 1]),
  248. #
  249. np.reshape(row["videoRealPlayCount"], [1, 1]),
  250. np.reshape(row["videoDuration"], [1, 1])
  251. ]
  252. video_embedding = video_layer_model(video_input)
  253. embedding_str = ",".join([str(x) for x in video_embedding.numpy().flatten()])
  254. video_embeddings.append([video_id, embedding_str])
  255. df_video_embedding = pd.DataFrame(video_embeddings, columns=["video_id", "video_embedding"])
  256. df_video_embedding.head()
  257. output = "./tensorflow_video_embedding-mind-tzld-210322.csv"
  258. df_video_embedding.to_csv(output, index=False)