import gc
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import FiBiNET


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: input value 0 is a special "padding", so we do not use 0
            # to encode a valid feature for sequence input.
            key2index[key] = len(key2index) + 1
    return list(map(lambda k: key2index[k], key_ans))


if __name__ == "__main__":
    begin_time = time.time()
    data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")

    sparse_features = ["videoid", "mid",
                       "videoGenre1", "videoGenre2",
                       "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                       "userGenre1", "userGenre2", "userCity",
                       "authorid", "userRealplayCount",
                       "videoRealPlayCount", "videoDuration"]
    target = ['label']
    feature_max_idx = {}

    # Drop rows with placeholder user ids.
    data = data[~data["mid"].isin(["unknown", "N000111111D"])].copy()

    # Does the same job as the split() helper above; see the DeepMatch DSSM example.
    def add_index_column(param_df, column_name):
        values = list(param_df[column_name].unique())
        value_index_dict = {value: idx for idx, value in enumerate(values)}
        if column_name == "mid":
            param_df["uidx"] = param_df[column_name].copy()
            param_df["mid"] = param_df[column_name].map(value_index_dict)
            feature_max_idx["mid"] = param_df["mid"].max() + 1

    add_index_column(data, "mid")

    for column_name in sparse_features:
        lbe = LabelEncoder()
        print("\n\n-------------- " + column_name)
        print(data[column_name])
        # Fill missing genres with a default category ("社会" = society).
        if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
                           "userGenre1", "userGenre2", "userGenre3"):
            data[column_name].fillna("社会", inplace=True)
        # Fill missing cities with a default city ("北京" = Beijing).
        if column_name == "userCity":
            data[column_name].fillna("北京", inplace=True)
        if column_name == "mid":
            continue  # mid was already re-indexed by add_index_column()
        # Shift LabelEncoder output by 1 so that id 0 stays reserved for the
        # padding value of the hist_video_id sequence built below.
        data[column_name] = lbe.fit_transform(data[column_name]) + 1
        feature_max_idx[column_name] = data[column_name].max() + 1

    key2index = {}
    print("\n\n ************ data process finish")

    # Collect each user's positively-labeled watch history as a list of video ids.
    user_video_list_df = data[data["label"] > 0].copy() \
        .groupby("mid")['videoid'].apply(list).reset_index()
    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)

    max_len = 50
    mid_list = list(user_video_list_df["mid"])
    print(user_video_list_df["mid"])
    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(len)
    print(user_video_list_df)
    print(len(user_video_list_df))

    emb_dim = 10
    fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
                              for feat in sparse_features]
    print(fixlen_feature_columns)

    use_weighted_sequence = False
    if use_weighted_sequence:
        # Notice: value 0 is for padding for sequence input features.
        varlen_feature_columns = [VarLenSparseFeat(
            SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
            maxlen=max_len, combiner='mean', weight_name='genres_weight')]
    else:
        # The watch-history sequence shares the videoid embedding table via
        # embedding_name="videoid"; value 0 is for padding.
        varlen_feature_columns = [VarLenSparseFeat(
            SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"],
                       embedding_dim=emb_dim, embedding_name="videoid"),
            maxlen=max_len, combiner='mean', length_name="hist_len")]
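    # A minimal illustrative sketch (commented out, matching the weighted
    # branch above): if use_weighted_sequence were True, a raw pipe-separated
    # multi-value column would be encoded with the split() helper and padded.
    # The "genres" column name is an assumption for illustration; this
    # dataset may not contain it.
    #
    # genres_list = list(map(split, data["genres"].values))
    # genres_pad = pad_sequences(genres_list, maxlen=max_len, padding='post', value=0)
    #
    # genres_pad and a weight array of shape (len(data), max_len, 1) would
    # then be added to model_input below as "genres" and "genres_weight".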
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    print(dnn_feature_columns)
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # Right join keeps only users that have a watch history.
    df_merge = pd.merge(left=data, right=user_video_list_df,
                        left_on="mid", right_on="mid", how="right")
    print(df_merge.head())
    print("df_merge len is: ", len(df_merge))
    df_merge = df_merge.sample(frac=1.0)  # shuffle all rows
    del data, user_video_list_df
    gc.collect()
    print("after sample df_merge len is: ", len(df_merge))

    model_input = {name: df_merge[name] for name in sparse_features}
    # Pad/truncate every user's history to max_len; 0 is the padding value.
    video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len,
                                       padding='post', truncating='post', value=0)
    model_input["hist_video_id"] = video_hist_seq_pad
    print("\n\n\n")
    print(video_hist_seq_pad)
    print("\n\nuser_vids_input len is: ", len(df_merge["hist_video_id"]))
    # model_input["genres_weight"] = np.random.randn(df_merge.shape[0], max_len, 1)
    model_input["hist_len"] = df_merge["hist_len"]
    print("\n\nuser_vids_len_input len is: ", len(df_merge["hist_len"]))

    model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='binary')

    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory to write to
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    output_model_file = os.path.join(logdir, 'xdeepfm_model.h5')

    callbacks = [
        tf.keras.callbacks.TensorBoard(logdir),
        tf.keras.callbacks.ModelCheckpoint(output_model_file, save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
    ]

    METRICS = [
        tf.keras.metrics.TruePositives(name='tp'),
        tf.keras.metrics.FalsePositives(name='fp'),
        tf.keras.metrics.TrueNegatives(name='tn'),
        tf.keras.metrics.FalseNegatives(name='fn'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
    ]

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

    model.fit(model_input, df_merge[target].values,
              batch_size=2048, epochs=5, verbose=2,
              validation_split=0.2, callbacks=callbacks)

    model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")
    tf.keras.models.save_model(model,
                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
    print("******* train FiBiNET cost time is: " + str(time.time() - begin_time))
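    # A minimal inference sketch: reload the .h5 file written by model.save()
    # above. DeepCTR models contain custom layers, so load_model needs
    # deepctr.layers.custom_objects (per the DeepCTR docs).
    from deepctr.layers import custom_objects
    reloaded_model = tf.keras.models.load_model("./tensorflow_xdeepfm-0325-tzld-2.h5",
                                                custom_objects=custom_objects)
    print(reloaded_model.predict(model_input, batch_size=2048)[:5])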