# WideNDeep_tzld_rank.py (11 KB)
  1. # encoding: utf-8
  2. import tensorflow as tf
  3. import numpy as np
  4. import pandas as pd
  5. import gc
  6. import os
  7. import time
  8. from sklearn.preprocessing import LabelEncoder
  9. from tensorflow.python.keras.preprocessing.sequence import pad_sequences
  10. begin_time = time.time()
  11. data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
  12. sparse_features = ["videoid", "mid",
  13. "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
  14. "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
  15. target = ['label']
  16. data = data.sort_values(by='logtimestamp', ascending=False)
  17. # 需要把生成的label id 都保留下来,供后面线上使用,拼接特征
  18. feature_max_idx = {}
  19. data = data[data["mid"] != "unknown"].copy()
  20. data["mid"].replace("unknown", "N000111111D", inplace=True)
  21. data = data[data["mid"] != "N000111111D"].copy()
  22. # 和上面函数的功能是一样的,见 deepMatch DSSM
  23. def add_index_column(param_df, column_name):
  24. values = list(param_df[column_name].unique())
  25. value_index_dict = {value: idx for idx, value in enumerate(values)}
  26. if column_name == "mid":
  27. param_df["uidx"] = param_df[column_name].copy()
  28. param_df["mid"] = param_df[column_name].map(value_index_dict) + 1
  29. feature_max_idx["mid"] = param_df["mid"].max() + 1
  30. add_index_column(data, "mid")
  31. for column_name in sparse_features:
  32. lbe = LabelEncoder()
  33. print("\n\n-------------- " + column_name)
  34. print(data[column_name])
  35. if column_name == "videoGenre1" or column_name == "videoGenre2" or \
  36. column_name == "videoGenre3" or column_name == "userGenre1" or column_name == "userGenre2" or column_name == "userGenre3":
  37. data[column_name].fillna("社会", inplace=True)
  38. continue
  39. if column_name == "userCity":
  40. data[column_name].fillna("北京", inplace=True)
  41. if column_name == "mid":
  42. continue
  43. if column_name == "videoid":
  44. data["vidx"] = data[column_name].copy()
  45. data["videoid"] = lbe.fit_transform(data[column_name]) + 1
  46. feature_max_idx["videoid"] = data["videoid"].max() + 1
  47. # print(data["videoid"])
  48. else:
  49. data[column_name] = lbe.fit_transform(data[column_name])
  50. feature_max_idx[column_name] = data[column_name].max() + 1
  51. key2index = {}
  52. print("\n\n ************ data process finish")
  53. user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
  54. user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
  55. max_len = 50
  56. mid_list = list(user_video_list_df["mid"])
  57. user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(lambda x: len(x))
  58. emb_dim = 10
  59. df_merge = pd.merge(left=data,
  60. right=user_video_list_df,
  61. left_on="mid",
  62. right_on="mid",
  63. how="right")
  64. df_merge.head()
  65. df_merge = df_merge.sample(frac=1.0)
  66. g1 = data["videoGenre1"].values
  67. g2 = data["videoGenre2"].values
  68. g4 = data["userGenre1"].values
  69. g5 = data["userGenre2"].values
  70. del data, user_video_list_df
  71. gc.collect()
  72. model_input = {name: df_merge[name] for name in sparse_features}
  73. video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len, padding='post', truncating='post', value=0)
  74. model_input["hist_video_id"] = video_hist_seq_pad
  75. model_input["hist_len"] = df_merge["hist_len"]
  76. genre_set = g1 + g2 + g4 + g5
  77. genre_vocab = list(set(genre_set))
  78. GENRE_FEATURES = {
  79. 'userGenre1': genre_vocab,
  80. 'userGenre2': genre_vocab,
  81. 'videoGenre1': genre_vocab,
  82. 'videoGenre2': genre_vocab,
  83. }
  84. emb_dim1 = 10
  85. emb_dim2 = 32
  86. categorical_columns = []
  87. for feature, vocab in GENRE_FEATURES.items():
  88. cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
  89. key=feature, vocabulary_list=vocab)
  90. emb_col = tf.feature_column.embedding_column(cat_col, emb_dim1)
  91. categorical_columns.append(emb_col)
  92. # video id embedding feature
  93. video_col = tf.feature_column.categorical_column_with_identity(key='videoid', num_buckets=feature_max_idx["videoid"])
  94. video_emb_col = tf.feature_column.embedding_column(video_col, emb_dim2)
  95. categorical_columns.append(video_emb_col)
  96. # user id embedding feature
  97. user_col = tf.feature_column.categorical_column_with_identity(key='mid', num_buckets=feature_max_idx["mid"])
  98. # user_emb_col = tf.feature_column.embedding_column(user_col, 10)
  99. user_emb_col = tf.feature_column.embedding_column(user_col, emb_dim2)
  100. categorical_columns.append(user_emb_col)
  101. user_city_col = tf.feature_column.categorical_column_with_identity(key='userCity', num_buckets=feature_max_idx["userCity"])
  102. user_city_emb_col = tf.feature_column.embedding_column(user_city_col, emb_dim1)
  103. categorical_columns.append(user_city_emb_col)
  104. authorid_col = tf.feature_column.categorical_column_with_identity(key='authorid', num_buckets=feature_max_idx["authorid"])
  105. authorid_emb_col = tf.feature_column.embedding_column(authorid_col, emb_dim2)
  106. categorical_columns.append(authorid_emb_col)
  107. numerical_columns = [
  108. tf.feature_column.numeric_column("userRealplayCount"),
  109. tf.feature_column.numeric_column("videoRealPlayCount"),
  110. tf.feature_column.numeric_column("videoDuration")
  111. ]
  112. # cross feature between current video and user historical video
  113. rated_video1 = tf.feature_column.categorical_column_with_identity(key='userRatedVideo1', num_buckets=feature_max_idx["videoid"])
  114. rated_video2 = tf.feature_column.categorical_column_with_identity(key='userRatedVideo2', num_buckets=feature_max_idx["videoid"])
  115. rated_video3 = tf.feature_column.categorical_column_with_identity(key='userRatedVideo3', num_buckets=feature_max_idx["videoid"])
  116. crossed_feature = tf.feature_column.indicator_column(tf.feature_column.crossed_column([video_col, rated_video1, rated_video2, rated_video3], feature_max_idx["videoid"]))
  117. user_features = ["uidx", "mid",
  118. "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
  119. "userGenre1", "userGenre2", "userCity",
  120. "userRealplayCount", "hist_video_id", "hist_len"]
  121. item_features = ["vidx","videoid", "videoGenre1", "videoGenre2",
  122. "authorid", "videoRealPlayCount", "videoDuration"]
  123. df_merge["hist_video_id"] = list(map(lambda x: ' '.join(list(map(str, x))), video_hist_seq_pad)) # 上下两条功能相同
  124. user_df = df_merge[user_features].drop_duplicates('uidx')
  125. video_df = df_merge[item_features].drop_duplicates('vidx')
  126. df_target = df_merge[target].copy()
  127. del df_merge
  128. gc.collect()
  129. from datetime import datetime
  130. TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
  131. user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
  132. video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)
  133. del user_df, video_df
  134. gc.collect()
  135. inputs = {
  136. 'videoid': tf.keras.layers.Input(name='videoid', shape=(), dtype='int32'),
  137. 'mid': tf.keras.layers.Input(name='mid', shape=(), dtype='int32'),
  138. 'userRatedVideo1': tf.keras.layers.Input(name='userRatedVideo1', shape=(), dtype='int32'),
  139. 'userRatedVideo2': tf.keras.layers.Input(name='userRatedVideo2', shape=(), dtype='int32'),
  140. 'userRatedVideo3': tf.keras.layers.Input(name='userRatedVideo3', shape=(), dtype='int32'),
  141. 'userRealplayCount': tf.keras.layers.Input(name='userRealplayCount', shape=(), dtype='int32'),
  142. 'videoRealPlayCount': tf.keras.layers.Input(name='videoRealPlayCount', shape=(), dtype='int32'),
  143. 'videoDuration': tf.keras.layers.Input(name='videoDuration', shape=(), dtype='int32'),
  144. "authorid": tf.keras.layers.Input(name='authorid', shape=(), dtype='int32'),
  145. "userCity": tf.keras.layers.Input(name='userCity', shape=(), dtype='int32'),
  146. 'userGenre1': tf.keras.layers.Input(name='userGenre1', shape=(), dtype='string'),
  147. 'userGenre2': tf.keras.layers.Input(name='userGenre2', shape=(), dtype='string'),
  148. 'videoGenre1': tf.keras.layers.Input(name='videoGenre1', shape=(), dtype='string'),
  149. 'videoGenre2': tf.keras.layers.Input(name='videoGenre2', shape=(), dtype='string'),
  150. 'hist_video_id':tf.keras.Input(shape=(50,), name='hist_video_id', dtype='int32'),
  151. 'hist_len': tf.keras.Input(shape=(), name='hist_len', dtype='int32'),
  152. }
  153. from tensorflow.python.feature_column.feature_column import _LazyBuilder
  154. hist_video_column = tf.feature_column.categorical_column_with_hash_bucket('hist_video_id', 2 * feature_max_idx["videoid"], dtype=tf.int32)
  155. hist_video_embedded = tf.feature_column.embedding_column(hist_video_column, emb_dim2)
  156. categorical_columns.append(hist_video_embedded)
  157. deep = tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns)(inputs)
  158. deep = tf.keras.layers.Dense(128, activation='relu')(deep)
  159. deep = tf.keras.layers.Dense(64, activation='relu')(deep)
  160. # wide part for cross feature
  161. wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)
  162. both = tf.keras.layers.concatenate([deep, wide])
  163. output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(both)
  164. model = tf.keras.Model(inputs, output_layer)
  165. logdir = os.path.join("log_callbacks") # Tensorboard需要一个文件夹
  166. if not os.path.exists(logdir):
  167. os.mkdir(logdir)
  168. output_model_file = os.path.join(logdir,
  169. 'wdl_model.h5')
  170. callbacks = [
  171. tf.keras.callbacks.TensorBoard(logdir),
  172. tf.keras.callbacks.ModelCheckpoint(output_model_file,
  173. save_best_only=True),
  174. tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
  175. ]
  176. METRICS = [
  177. tf.keras.metrics.TruePositives(name='tp'),
  178. tf.keras.metrics.FalsePositives(name='fp'),
  179. tf.keras.metrics.TrueNegatives(name='tn'),
  180. tf.keras.metrics.FalseNegatives(name='fn'),
  181. tf.keras.metrics.BinaryAccuracy(name='accuracy'),
  182. tf.keras.metrics.Precision(name='precision'),
  183. tf.keras.metrics.Recall(name='recall'),
  184. tf.keras.metrics.AUC(name='auc'),
  185. tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
  186. tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
  187. ]
  188. model.compile(
  189. loss='binary_crossentropy',
  190. optimizer='adam',
  191. metrics=METRICS
  192. )
  193. model.fit(model_input, df_target,
  194. batch_size=4096, epochs=1,
  195. verbose=2, validation_split=0.2, callbacks=callbacks)
  196. model.save("./tensorflow_WDL-0416-tzld-2.h5")
  197. tf.keras.models.save_model(model,
  198. "file:///work/xielixun/WDL_0416/tensorflow_WDL-0416-tzld/001",
  199. # "file:///Users/xielixun/Desktop/lixunxie/python_code/DeepMatch/examples/WDL_0416/tensorflow_WDL-0416-tzld/001",
  200. overwrite=True,
  201. include_optimizer=True,
  202. save_format=None,
  203. signatures=None,
  204. options=None)
  205. print("trainning wDL in 2021-0415 cost time: " + str(time.time() - begin_time))