# Dssm_tzld_match.py

# encoding: utf-8
import pandas as pd
import os
import gc
import time
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
# from preprocess_tzld210322_gen import gen_data_set, gen_model_input, gen_model_input_user_emb
# from preprocess_tzld210327_gen import gen_data_set, gen_model_input, gen_model_input_user_emb
from preprocess_tzld210423_gen import gen_data_set, gen_model_input, gen_model_input_user_emb
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
import tensorflow as tf
import numpy as np
from deepmatch.models import *

count_train = 1
count_test = 1
batch_size = 4096


def generate_arrays_from_train(train_set, user_profile, SEQ_LEN):
    # Stream (inputs, labels) batches built by gen_model_input to Keras fit_generator.
    while 1:
        for i in range(0, len(train_set), batch_size):
            try:
                train_batch = train_set[i: i + batch_size]
                train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
                if train_model_input_batch is None or train_label_batch is None:
                    continue
                print("train i: " + str(i) + " len train set " + str(len(train_set)))
                print(train_model_input_batch)
                print(train_label_batch)
                yield (train_model_input_batch, train_label_batch)
            except Exception as ex:
                print("\n\n generate_arrays_from_train exception ", ex)
                continue
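
# Note: Keras pulls batches from this generator on demand. Because the outer loop never
# terminates, epoch boundaries come solely from steps_per_epoch passed to fit_generator
# below; the per-batch prints are debug output for the dicts built by gen_model_input.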


def generate_arrays_from_train_bak(train_set, user_profile, SEQ_LEN):
    global count_train
    while True:
        try:
            train_batch = train_set[(count_train - 1) * batch_size: count_train * batch_size]
            train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
            if count_train % 1000 == 0:
                print("count:" + str(count_train) + " len train set " + str(len(train_set)))
            count_train = count_train + 1
            if count_train * batch_size > len(train_set):
                count_train = 1
            yield (train_model_input_batch, train_label_batch)
        except Exception as ex:
            print("\n\n generate_arrays_from_file exception ", ex)
            count_train = count_train + 1
            continue
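
# The *_bak generators are an alternative implementation that keeps a global batch
# cursor (count_train / count_test) instead of a for-loop; they are kept for reference
# and are not used in __main__ below.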


def generate_arrays_from_test(train_set, user_profile, SEQ_LEN):
    # Same batching loop as generate_arrays_from_train, but builds the inputs used to
    # export user embeddings (gen_model_input_user_emb).
    while 1:
        for i in range(0, len(train_set), batch_size):
            try:
                train_batch = train_set[i: i + batch_size]
                # train_model_input_batch, train_label_batch = gen_model_input(train_batch, user_profile, SEQ_LEN)
                train_model_input_batch, train_label_batch = gen_model_input_user_emb(train_batch, user_profile, SEQ_LEN)
                if train_model_input_batch is None or train_label_batch is None:
                    continue
                print("test i: " + str(i) + " len train set " + str(len(train_set)))
                yield (train_model_input_batch, train_label_batch)
            except Exception as ex:
                print("\n\n generate_arrays_from_test exception ", ex)
                continue


def generate_arrays_from_test_bak(train_set, user_profile, SEQ_LEN):
    global count_test
    while True:
        try:
            train_batch = train_set[(count_test - 1) * batch_size: count_test * batch_size]
            train_model_input_batch, train_label_batch = gen_model_input_user_emb(train_batch, user_profile, SEQ_LEN)
            if count_test % 1000 == 0:
                print("count:" + str(count_test) + " len train set " + str(len(train_set)))
            count_test = count_test + 1
            if count_test * batch_size > len(train_set):
                count_test = 1
            yield (train_model_input_batch, train_label_batch)
        except Exception as ex:
            print("\n\n generate_arrays_from_file exception ", ex)
            count_test = count_test + 1
            continue


if __name__ == "__main__":
    begin_time = time.time()
    data = pd.read_csv("/work/xielixun/user_action_feature_wdl_recall_app20210422.csv")
    print(data[0:5])
    sparse_features = ["videoid", "mid", "mobileos",
                       "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration", "videorealplayrate"]
    SEQ_LEN = 50
    negsample = 3
    features = ["videoid", "mid", "mobileos",
                "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
                "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration", "videorealplayrate"]
    feature_max_idx = {}

    data["mid"].replace("unknown", "N000111111D", inplace=True)
    data = data[data["mid"] != "unknown"].copy()
    data = data[data["mid"] != "N000111111D"].copy()
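    # Net effect of the three lines above: rows whose mid is "unknown" are first remapped
    # to the placeholder "N000111111D" and then dropped, so only rows with a real user id
    # remain before the id columns are indexed.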
    # Same functionality as the function above; see deepMatch DSSM.
    def add_index_column(param_df, column_name):
        values = list(param_df[column_name].unique())
        value_index_dict = {value: idx for idx, value in enumerate(values)}
        if column_name == "mid":
            param_df["uidx"] = param_df[column_name].copy()
            param_df["mid"] = param_df[column_name].map(value_index_dict) + 1
            feature_max_idx["mid"] = param_df["mid"].max() + 1

    add_index_column(data, "mid")
    feature_max_idx["videoid"] = data["videoid"].max() + 1
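    # feature_max_idx[...] holds the vocabulary size of each id column and is passed to
    # SparseFeat below as the embedding input dimension. mid is mapped to a 1-based index,
    # which presumably leaves 0 free as the padding value for the hist_video_id sequence.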
    for idx, column_name in enumerate(features):
        lbe = LabelEncoder()
        if column_name == "videoGenre1" or column_name == "videoGenre2" or \
                column_name == "videoGenre3" or column_name == "userGenre1" or column_name == "userGenre2" or column_name == "userGenre3":
            data[column_name].fillna("社会", inplace=True)
        if column_name == "userCity":
            data[column_name].fillna("北京", inplace=True)
        if column_name == "mid":
            # mid was already indexed by add_index_column above; the lines after continue are unreachable.
            continue
            data["uidx"] = data[column_name].copy()
            data["mid"] = lbe.fit_transform(data[column_name])
            feature_max_idx["mid"] = data["mid"].max() + 1
        elif column_name == "videoid":  # videoid produced by negative sampling; not discretized here
            continue
            data["vidx"] = data[column_name].copy()
            data["videoid"] = lbe.fit_transform(data[column_name])
            feature_max_idx["videoid"] = data["videoid"].max() + 1
        else:
            data[column_name] = lbe.fit_transform(data[column_name])
            feature_max_idx[column_name] = data[column_name].max() + 1
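    # LabelEncoder maps each raw category to a dense integer id starting at 0 (classes are
    # sorted before assignment), so max(ids) + 1 is the vocabulary size. A minimal sketch
    # with made-up values:
    #   lbe = LabelEncoder()
    #   lbe.fit_transform(["ios", "android", "ios"])   # -> array([1, 0, 1])
    #   vocab_size = 1 + 1                             # max id + 1 == 2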

    user_profile = data[["uidx", "mid", "mobileos", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity", "userRealplayCount",
                         "videoGenre1", "videoGenre2", "authorid", "videoRealPlayCount", "videoDuration", "videorealplayrate"]].drop_duplicates('mid')
    user_mid_uid = data[["uidx", "mid"]].drop_duplicates('mid')
    user_mid_uid.rename(columns={'mid': 'userid'}, inplace=True)
    item_profile = data[
        ["videoid", "videoGenre1", "videoGenre2", "authorid", "videoRealPlayCount", "videoDuration", "videorealplayrate"]].drop_duplicates(
        'videoid')
    print("item size is: ", len(item_profile))
    user_profile.set_index("mid", inplace=True)
    print(data)
    print("\n\n after group by mid videoid")
    # print(data)
    del data
    gc.collect()
    # Sampled by sequence, without offline negative sampling; negatives would be drawn
    # in-batch during training.
    # test_path = "/work/xielixun/test_user_video_play_test.csv"
    # train_path = "/work/xielixun/train_user_video_play.csv"
    # Data with offline negative sampling
    # test_path = "/work/xielixun/test_user_video_play_test_sample_negtive.csv"
    # train_path = "/work/xielixun/train_user_video_play_sample_negtive.csv"
    test_path = "/work/xielixun/sampleNegtive/test_set_0422.csv"
    train_path = "/work/xielixun/sampleNegtive/train_set_0422.csv"
    train_set, test_set = gen_data_set(train_path, test_path, user_mid_uid)
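    # Assumption: gen_data_set (from preprocess_tzld210423_gen) reads the pre-sampled
    # train/test CSVs and joins user_mid_uid so each sample can be mapped back to both the
    # encoded mid and the original uidx; the exact format is defined in that module.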

    embedding_dim = 32
    user_feature_columns = [SparseFeat('mid', feature_max_idx['mid'], embedding_dim),
                            SparseFeat("mobileos", feature_max_idx["mobileos"], embedding_dim),
                            SparseFeat("userRatedVideo1", feature_max_idx['userRatedVideo1'], embedding_dim),
                            SparseFeat("userRatedVideo2", feature_max_idx['userRatedVideo2'], embedding_dim),
                            SparseFeat("userRatedVideo3", feature_max_idx['userRatedVideo3'], embedding_dim),
                            SparseFeat("userGenre1", feature_max_idx['userGenre1'], embedding_dim),
                            SparseFeat("userGenre2", feature_max_idx['userGenre2'], embedding_dim),
                            SparseFeat("userCity", feature_max_idx['userCity'], embedding_dim),
                            SparseFeat("userRealplayCount", feature_max_idx['userRealplayCount'], embedding_dim),
                            VarLenSparseFeat(SparseFeat('hist_video_id', feature_max_idx['videoid'], embedding_dim,
                                                        embedding_name="videoid"), SEQ_LEN, 'mean', 'hist_len'),
                            ]
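    # hist_video_id is a variable-length sequence feature: embedding_name="videoid" makes
    # it share the item-id embedding table, the embeddings of up to SEQ_LEN watched videos
    # are mean-pooled, and 'hist_len' names the input carrying the true sequence length.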
    item_feature_columns = [SparseFeat('videoid', feature_max_idx['videoid'], embedding_dim),
                            SparseFeat("videoGenre1", feature_max_idx["videoGenre1"], embedding_dim),
                            SparseFeat("videoGenre2", feature_max_idx["videoGenre2"], embedding_dim),
                            SparseFeat("authorid", feature_max_idx["authorid"], embedding_dim),
                            SparseFeat("videoRealPlayCount", feature_max_idx["videoRealPlayCount"], embedding_dim),
                            SparseFeat("videoDuration", feature_max_idx["videoDuration"], embedding_dim),
                            SparseFeat("videorealplayrate", feature_max_idx["videorealplayrate"], embedding_dim)]

    feature_names = get_feature_names(user_feature_columns + item_feature_columns)
    print("\n\nfeature_names is: ")
    print(feature_names)
    # Without the lines below, YoutubeDNN hit this error:
    # tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed:
    # [predictions must be <= 1] [Condition x <= y did not hold element-wise:]
    # [x (functional_1/sampled_softmax_layer/ExpandDims:0) = ] [[0.944198132][1.15184534][1.00592339]...] [y (Cast_4/x:0) = ] [1]
    K.set_learning_phase(True)

    import tensorflow as tf
    if tf.__version__ >= '2.0.0':
        tf.compat.v1.disable_eager_execution()

    model = DSSM(user_feature_columns, item_feature_columns)
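    # DSSM (from deepmatch.models) builds a two-tower model: one DNN tower over the user
    # feature columns and one over the item feature columns, scored by the similarity of
    # the two tower outputs against a binary label. Eager execution appears to need to be
    # disabled because this version of the DeepMatch/DeepCTR stack expects graph mode
    # (see the error noted above).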

    logdir = os.path.join("log_callbacks_dssm")  # TensorBoard needs a directory
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    output_model_file = os.path.join(logdir, 'dssm_model.h5')
    callbacks = [
        tf.keras.callbacks.TensorBoard(logdir),
        tf.keras.callbacks.ModelCheckpoint(output_model_file,
                                           save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-5),
    ]
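    # Note: EarlyStopping and ModelCheckpoint(save_best_only=True) monitor 'val_loss' by
    # default; since no validation data is passed to fit_generator below, Keras will warn
    # that the metric is unavailable (checkpoints are skipped and early stopping never
    # triggers). The model is still saved explicitly with model.save() after training.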

    METRICS = [
        tf.keras.metrics.TruePositives(name='tp'),
        tf.keras.metrics.FalsePositives(name='fp'),
        tf.keras.metrics.TrueNegatives(name='tn'),
        tf.keras.metrics.FalseNegatives(name='fn'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
    ]

    # compile the model, set loss function, optimizer and evaluation metrics
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=METRICS
    )
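    # DeepMatch's DSSM ends in a sigmoid-style prediction layer, so the click/no-click
    # label from the negative-sampled data is trained with binary_crossentropy; the metric
    # list above (accuracy, precision/recall, ROC- and PR-AUC) matches that binary
    # classification setup.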

    model.fit_generator(generate_arrays_from_train(train_set, user_profile, SEQ_LEN),
                        steps_per_epoch=len(train_set) // batch_size,
                        epochs=2, max_queue_size=10, workers=1,
                        callbacks=callbacks, verbose=1, use_multiprocessing=False)
    model.save("./tensorflow_DSSM-027-tzld-1.h5")
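    # Caveat: an .h5 saved this way contains custom DeepCTR/DeepMatch layers, so loading it
    # later requires passing their custom objects (e.g. deepctr.layers.custom_objects) to
    # tf.keras.models.load_model.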

    all_item_model_input = {"videoid": item_profile['videoid'].values,
                            "videoGenre1": item_profile['videoGenre1'].values,
                            "videoGenre2": item_profile['videoGenre2'].values,
                            "authorid": item_profile['authorid'].values,
                            "videoRealPlayCount": item_profile['videoRealPlayCount'].values,
                            "videoDuration": item_profile['videoDuration'].values,
                            "videorealplayrate": item_profile['videorealplayrate'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
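    # model.user_input / model.user_embedding and model.item_input / model.item_embedding
    # are attributes attached by DeepMatch's DSSM so the two towers can be split into
    # separate sub-models for exporting user and item vectors.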
    user_embs = user_embedding_model.predict_generator(generate_arrays_from_test(test_set, user_profile, SEQ_LEN),
                                                       steps=len(test_set) // batch_size,
                                                       max_queue_size=1, workers=1, callbacks=callbacks,
                                                       verbose=1, use_multiprocessing=False)
    test_model_input_emb2, test_model_label2 = gen_model_input_user_emb(test_set, user_profile, SEQ_LEN)
    user_embs2 = user_embedding_model.predict(test_model_input_emb2, batch_size=2 ** 12)
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
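    # Note: user_embs from predict_generator above is not used further; the exported user
    # embeddings come from user_embs2, a single predict over the full test set.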
    # test_set carries the original mid values, which can be used as the key (mid) for the
    # generated embeddings.
    user_embs = list(map(str, user_embs2))
    user_res = {"mid": test_model_input_emb2["userid_org"], "emb": user_embs}
    user_embs_res = pd.DataFrame.from_dict(user_res)
    output_user = "./tensorflow_user_embedding-dssm-tzld-210423.csv"
    user_embs_res.to_csv(output_user, index=False)

    item_embs = list(map(str, item_embs))
    item_res = {"videoid": all_item_model_input["videoid"], "emb": item_embs}
    item_embs_res = pd.DataFrame.from_dict(item_res)
    output_item = "./tensorflow_video_embedding-dssm-tzld-210423.csv"
    item_embs_res.to_csv(output_item, index=False)
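    # Each "emb" cell is the str() of a numpy vector, so downstream consumers of these
    # CSVs need to parse that textual representation back into an array.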

    print("************ 21-04-23 train dssm cost time : " + str(time.time() - begin_time))