# xdeepfm_tzld_rank.py
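# Train an xDeepFM binary ranking model with DeepCTR on user/video action
# features: encode sparse features, build per-user watch-history sequences,
# export user/item feature tables to CSV, then fit and save the model in both
# H5 and SavedModel form.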

import numpy as np
import pandas as pd
import gc
import os
import time
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import tensorflow as tf
from deepctr.models import xDeepFM


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: input value 0 is a special "padding" value, so we do not
            # use 0 to encode a valid feature value for sequence input.
            key2index[key] = len(key2index) + 1
    return list(map(lambda k: key2index[k], key_ans))
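
# Note: split() is defined for pipe-separated multi-value fields (the
# "genres"-style columns of the DeepCTR examples) but is never called in this
# script; a hypothetical usage, if such a column were loaded, would be:
#   genres_list = list(map(split, data['genres'].values))
#   genres_pad = pad_sequences(genres_list, maxlen=max_len, padding='post')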


if __name__ == "__main__":
    begin_time = time.time()
    data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")

    sparse_features = ["videoid", "mid",
                       "videoGenre1", "videoGenre2",
                       "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                       "userGenre1", "userGenre2", "userCity",
                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
    # target = ['rating']
    target = ['label']
    feature_max_idx = {}

    data = data[data["mid"] != "unknown"].copy()
    data["mid"].replace("unknown", "N000111111D", inplace=True)
    data = data[data["mid"] != "N000111111D"].copy()
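    # After the first filter removes mid == "unknown", the replace() and the
    # second filter above are effectively no-ops, kept as defensive cleanup.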

    # Same functionality as the function above; see the deepMatch DSSM script.
    def add_index_column(param_df, column_name):
        values = list(param_df[column_name].unique())
        value_index_dict = {value: idx for idx, value in enumerate(values)}
        if column_name == "mid":
            # Keep the original user id in "uidx", then overwrite "mid" with a
            # dense integer index.
            param_df["uidx"] = param_df[column_name].copy()
            param_df["mid"] = param_df[column_name].map(value_index_dict)
            feature_max_idx["mid"] = param_df["mid"].max() + 1

    add_index_column(data, "mid")
    feature_max_idx["videoid"] = data["videoid"].max() + 1

    for column_name in sparse_features:
        lbe = LabelEncoder()
        print("\n\n-------------- " + column_name)
        print(data[column_name])
        if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
                           "userGenre1", "userGenre2", "userGenre3"):
            data[column_name].fillna("社会", inplace=True)  # default genre ("society")
        if column_name == "userCity":
            data[column_name].fillna("北京", inplace=True)  # default city (Beijing)
        if column_name == "mid":
            continue
        if column_name == "videoid":
            continue
        data[column_name] = lbe.fit_transform(data[column_name])
        feature_max_idx[column_name] = data[column_name].max() + 1
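    # feature_max_idx now holds the vocabulary size (max index + 1) for every
    # sparse feature; these sizes are passed to SparseFeat as vocabulary_size
    # so the embedding tables are allocated large enough.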

    key2index = {}
    print("\n\n ************ data process finish")

    user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
    print(user_video_list_df)
    print(type(user_video_list_df))

    max_len = 50
    # print(list(user_video_length.keys()))
    # print(list(user_video_length.keys))
    mid_list = list(user_video_list_df["mid"])
    print(user_video_list_df["mid"])
    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(lambda x: len(x))
    print(user_video_list_df)
    print(len(user_video_list_df))
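    # user_video_list_df now maps each mid to the list of video ids the user
    # interacted with positively (label > 0); hist_len records the true
    # sequence length, referenced by length_name in the VarLenSparseFeat below.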

    emb_dim = 10
    fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
                              for feat in sparse_features]
    print(fixlen_feature_columns)

    use_weighted_sequence = False
    if use_weighted_sequence:
        # Notice: value 0 is reserved as padding for sequence input features.
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(key2index) + 1,
                                                              embedding_dim=4),
                                                   maxlen=max_len, combiner='mean',
                                                   weight_name='genres_weight')]
    else:
        # varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        #     key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
        #     weight_name=None)]
        # Notice: value 0 is reserved as padding for sequence input features.
        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('hist_video_id',
                                                              vocabulary_size=feature_max_idx["videoid"],
                                                              embedding_dim=emb_dim,
                                                              embedding_name="videoid"),
                                                   maxlen=max_len, combiner='mean',
                                                   length_name="hist_len")]
    print(varlen_feature_columns)

    # linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  # + varlen_feature_columns
    print(dnn_feature_columns)
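    # varlen_feature_columns is built but left out of both the linear and dnn
    # feature column lists (the "+" is commented out), so the history-sequence
    # embedding is not part of this xDeepFM model; the padded history is still
    # prepared below and exported with the user feature table.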
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    print("data len is: ", len(data))

    df_merge = pd.merge(left=data,
                        right=user_video_list_df,
                        left_on="mid",
                        right_on="mid",
                        # how="right")
                        how="inner")
    df_merge.head()
    print("df_merge len is: ", len(df_merge))
    df_merge = df_merge.sample(frac=1.0)
    # df_merge = df_merge.sample(frac=0.3)
    del data, user_video_list_df
    gc.collect()
    print("after sample df_merge len is: ", len(df_merge))

    model_input = {name: df_merge[name] for name in sparse_features}
    # df_merge["hist_video_id"].fillna(str([0, 0, 0]), inplace=True)
    # df_merge["hist_len"].fillna(0, inplace=True)
    print(df_merge)
    video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len,
                                       padding='post', truncating='post', value=0)
    model_input["hist_video_id"] = video_hist_seq_pad
    print("\n\n\n")
    print(video_hist_seq_pad)
    print("\n\nuser_vids_input len is: ", len(df_merge["hist_video_id"]))
    model_input["hist_len"] = df_merge["hist_len"]
    print("\n\nuser_vids_len_input len is: ", len(df_merge["hist_len"]))
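    # pad_sequences right-pads and right-truncates every history to exactly
    # max_len = 50 ids, with 0 as the fill value, following the DeepCTR
    # convention that 0 is reserved for padding in sequence inputs.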

    user_features = ["uidx", "mid",
                     "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                     "userGenre1", "userGenre2", "userCity",
                     "userRealplayCount", "hist_video_id", "hist_len"]
    item_features = ["videoid", "videoGenre1", "videoGenre2",
                     "authorid", "videoRealPlayCount", "videoDuration"]
    # The lines above and below are functionally the same: serialize the padded
    # history to a space-separated string so it survives the CSV export.
    df_merge["hist_video_id"] = list(map(lambda x: ' '.join(list(map(str, x))), video_hist_seq_pad))
    user_df = df_merge[user_features].drop_duplicates('uidx')
    video_df = df_merge[item_features].drop_duplicates('videoid')

    from datetime import datetime
    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
    user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
    video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)
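    # The timestamped user/item CSVs are presumably consumed by a downstream
    # serving or matching job; training below does not read them back.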

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory to write logs to
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    output_model_file = os.path.join(logdir, 'xdeepfm_model.h5')

    callbacks = [
        tf.keras.callbacks.TensorBoard(logdir),
        tf.keras.callbacks.ModelCheckpoint(output_model_file,
                                           save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
    ]
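    # ModelCheckpoint and EarlyStopping both monitor 'val_loss' by default, so
    # the checkpoint keeps the best-validation-loss weights and training stops
    # after 5 epochs without an improvement of at least 1e-3.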

    METRICS = [
        tf.keras.metrics.TruePositives(name='tp'),
        tf.keras.metrics.FalsePositives(name='fp'),
        tf.keras.metrics.TrueNegatives(name='tn'),
        tf.keras.metrics.FalseNegatives(name='fn'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        # tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
        tf.keras.metrics.AUC(name='auc-PRC', curve='PR'),
    ]
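    # Tracking both ROC-AUC and PR-AUC is useful here: with the typically
    # imbalanced click/play labels of a ranking task, PR-AUC is the more
    # sensitive indicator of positive-class performance.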

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=METRICS
    )
    model.fit(model_input, df_merge[target].values,
              batch_size=4096, epochs=2,
              verbose=2, validation_split=0.2, callbacks=callbacks)

    model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")
    tf.keras.models.save_model(model,
                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
    print("******* train xdeepfm cost time is: " + str(time.time() - begin_time))
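
    # A minimal sketch of reloading the H5 checkpoint for inference, assuming
    # DeepCTR's custom-layer registry (deepctr.layers.custom_objects); the
    # feature dict passed to predict() must contain the model's input columns:
    #   from tensorflow.keras.models import load_model
    #   from deepctr.layers import custom_objects
    #   reloaded = load_model("./tensorflow_xdeepfm-0325-tzld-2.h5", custom_objects)
    #   preds = reloaded.predict({name: df_merge[name] for name in sparse_features})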