@@ -0,0 +1,219 @@
+import gc
+import os
+import time
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
+from deepctr.models import xDeepFM
+
+
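+# Encode a '|'-separated multi-value string as a list of integer ids, growing
+# the global key2index vocabulary on the fly (0 is reserved for padding).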
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            # Notice: input value 0 is a special "padding", so we do not use 0
+            # to encode a valid feature for sequence input.
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda k: key2index[k], key_ans))
+
+
+if __name__ == "__main__":
+
+    begin_time = time.time()
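+    # Load the raw user-action feature table; sparse_features lists the
+    # categorical columns fed to the model.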
+ data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
+    sparse_features = ["videoid", "mid",
+                       "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2",
+                       "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
+                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
+
+    # target = ['rating']
+    target = ['label']
+
+    feature_max_idx = {}
+
+    # Drop rows whose user id (mid) is unknown.
+    data = data[data["mid"] != "unknown"].copy()
+
+    # Same functionality as the function above; see the DeepMatch DSSM example.
+    def add_index_column(param_df, column_name):
+        values = list(param_df[column_name].unique())
+        value_index_dict = {value: idx for idx, value in enumerate(values)}
+        if column_name == "mid":
+            # Keep the raw mid as uidx and remap mid to a dense 0-based index.
+            param_df["uidx"] = param_df[column_name].copy()
+            param_df["mid"] = param_df[column_name].map(value_index_dict)
+            feature_max_idx["mid"] = param_df["mid"].max() + 1
+
+ add_index_column(data, "mid")
+    feature_max_idx["videoid"] = data["videoid"].max() + 1
+
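+    # Label-encode every sparse feature; genre/city NaNs get default values
+    # first, and mid/videoid are skipped because they were indexed above.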
+ for column_name in sparse_features:
+        lbe = LabelEncoder()
+        print("\n\n-------------- " + column_name)
+        print(data[column_name])
+
+        if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
+                           "userGenre1", "userGenre2", "userGenre3"):
+            data[column_name].fillna("社会", inplace=True)
+
+        if column_name == "userCity":
+            data[column_name].fillna("北京", inplace=True)
+
+        # mid and videoid already have dense indices.
+        if column_name in ("mid", "videoid"):
+            continue
+
+        data[column_name] = lbe.fit_transform(data[column_name])
+        feature_max_idx[column_name] = data[column_name].max() + 1
+
+    key2index = {}
+
+ print("\n\n ************ data process finish")
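+    # Collect each user's positively-labeled videos as their watch history.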
+ user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
+    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
+
+    print(user_video_list_df)
+    print(type(user_video_list_df))
+
+ max_len = 50
+    mid_list = list(user_video_list_df["mid"])
+    print(user_video_list_df["mid"])
+    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(len)
+    print(user_video_list_df)
+    print(len(user_video_list_df))
+ emb_dim = 10
+
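+    # One embedding column per sparse feature, all with the same embedding size.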
+ fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
+                              for feat in sparse_features]
+
+    print(fixlen_feature_columns)
+
+    use_weighted_sequence = False
+    if use_weighted_sequence:
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+            weight_name='genres_weight')]  # Notice: value 0 is for padding for sequence input feature
+    else:
+        # varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+        #     key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+        #     weight_name=None)]  # Notice: value 0 is for padding for sequence input feature
+        # The history sequence shares its embedding table with videoid via embedding_name.
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"], embedding_dim=emb_dim,
+                                                              embedding_name="videoid"), maxlen=max_len, combiner='mean',
+                                                   length_name="hist_len")]  # Notice: value 0 is for padding for sequence input feature
+
+    print(varlen_feature_columns)
+    # linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    linear_feature_columns = fixlen_feature_columns
+    dnn_feature_columns = fixlen_feature_columns  # + varlen_feature_columns
+    print(dnn_feature_columns)
+    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
+
+ print("data len is: ", len(data))
+
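+    # Attach each user's history columns to every sample; the inner join drops
+    # users that have no positive history.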
+ df_merge = pd.merge(left=data,
+                        right=user_video_list_df,
+                        left_on="mid",
+                        right_on="mid",
+                        # how="right")
+                        how="inner")
+    print(df_merge.head())
+
+ print("df_merge len is: ", len(df_merge))
+
+    # Shuffle once up front so the validation split drawn by fit() is a random sample.
+    df_merge = df_merge.sample(frac=1.0)
+    # df_merge = df_merge.sample(frac=0.3)
+    del data, user_video_list_df
+    gc.collect()
+
+    print("after sample df_merge len is: ", len(df_merge))
+
+    model_input = {name: df_merge[name] for name in sparse_features}
+
+    # df_merge["hist_video_id"].fillna(str([0, 0, 0]), inplace=True)
+    # df_merge["hist_len"].fillna(0, inplace=True)
+
+    print(df_merge)
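+    # Pad/truncate each history to max_len (post-padding with the reserved id 0).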
+ video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len, padding='post', truncating='post', value=0)
+
+    model_input["hist_video_id"] = video_hist_seq_pad
+    print("\n\n\n")
+    print(video_hist_seq_pad)
+    print("\n\nuser_vids_input len is: ", len(df_merge["hist_video_id"]))
+
+    model_input["hist_len"] = df_merge["hist_len"]
+
+    print("\n\nuser_vids_len_input len is: ", len(df_merge["hist_len"]))
+ user_features = ["uidx", "mid",
+                     "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
+                     "userGenre1", "userGenre2", "userCity",
+                     "userRealplayCount", "hist_video_id", "hist_len"]
+
+    item_features = ["videoid", "videoGenre1", "videoGenre2",
+                     "authorid", "videoRealPlayCount", "videoDuration"]
+
+    # Serialize the padded history as a space-separated string (same content as video_hist_seq_pad).
+    df_merge["hist_video_id"] = list(map(lambda x: ' '.join(map(str, x)), video_hist_seq_pad))
+
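+    # Export deduplicated user and item feature tables, stamped with the current time.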
+ user_df = df_merge[user_features].drop_duplicates('uidx')
+    video_df = df_merge[item_features].drop_duplicates('videoid')
+
+    from datetime import datetime
+
+    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
+    user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
+    video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)
+
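+    # Binary-classification xDeepFM (linear + CIN + DNN) over the fixed-length columns.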
+ model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
+
+    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory for its logs
+    if not os.path.exists(logdir):
+        os.mkdir(logdir)
+    output_model_file = os.path.join(logdir, 'xdeepfm_model.h5')
+
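+    # TensorBoard logging, best-model checkpointing (on val_loss by default),
+    # and early stopping when validation loss stops improving.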
+ callbacks = [
+        tf.keras.callbacks.TensorBoard(logdir),
+        tf.keras.callbacks.ModelCheckpoint(output_model_file,
+                                           save_best_only=True),
+        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
+    ]
+
+ METRICS = [
+        tf.keras.metrics.TruePositives(name='tp'),
+        tf.keras.metrics.FalsePositives(name='fp'),
+        tf.keras.metrics.TrueNegatives(name='tn'),
+        tf.keras.metrics.FalseNegatives(name='fn'),
+        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
+        tf.keras.metrics.Precision(name='precision'),
+        tf.keras.metrics.Recall(name='recall'),
+        # tf.keras.metrics.AUC(name='auc'),
+        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
+        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
+    ]
+
+ model.compile(
+        loss='binary_crossentropy',
+        optimizer='adam',
+        metrics=METRICS
+    )
+
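+    # Train on the 'label' target with a 20% validation split.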
+ model.fit(model_input, df_merge[target].values,
+              batch_size=4096, epochs=2,
+              verbose=2, validation_split=0.2, callbacks=callbacks)
+
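+    # Persist the model twice: as a single HDF5 file and as a versioned
+    # directory (SavedModel format, version 001).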
+ model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")
+    tf.keras.models.save_model(model,
+                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
+                               overwrite=True,
+                               include_optimizer=True,
+                               save_format=None,
+                               signatures=None,
+                               options=None)
+
+    print("******* train xdeepfm cost time is: " + str(time.time() - begin_time))