import gc
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import FiBiNET


def split(x):
    """Encode a '|'-separated multi-value string into a list of integer ids."""
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: input value 0 is a special "padding" value, so we do not
            # use 0 to encode a valid feature for sequence input.
            key2index[key] = len(key2index) + 1
    return list(map(lambda k: key2index[k], key_ans))
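
# Illustrative example (assumed genre strings): with key2index = {},
# split("Action|Comedy") returns [1, 2] and leaves
# key2index == {"Action": 1, "Comedy": 2}; id 0 stays reserved for padding.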


if __name__ == "__main__":
    begin_time = time.time()
    data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
    sparse_features = ["videoid", "mid",
                       "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                       "userGenre1", "userGenre2", "userCity",
                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
    target = ['label']
    feature_max_idx = {}

    # Drop rows whose user id is missing.
    data = data[data["mid"] != "unknown"].copy()
    # Same functionality as the LabelEncoder loop below; see the deepMatch DSSM example.
    def add_index_column(param_df, column_name):
        values = list(param_df[column_name].unique())
        value_index_dict = {value: idx for idx, value in enumerate(values)}
        if column_name == "mid":
            # Keep the raw user id in "uidx" before re-indexing.
            param_df["uidx"] = param_df[column_name].copy()
        param_df[column_name] = param_df[column_name].map(value_index_dict)
        feature_max_idx[column_name] = param_df[column_name].max() + 1

    add_index_column(data, "mid")
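    # Illustrative example (assumed ids): mids ["a", "b", "a"] become
    # uidx ["a", "b", "a"] and mid [0, 1, 0], so feature_max_idx["mid"] == 2.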
    for column_name in sparse_features:
        if column_name == "mid":
            continue  # already indexed by add_index_column above
        print("\n\n-------------- " + column_name)
        print(data[column_name])
        if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
                           "userGenre1", "userGenre2", "userGenre3"):
            data[column_name] = data[column_name].fillna("社会")  # default genre
        if column_name == "userCity":
            data[column_name] = data[column_name].fillna("北京")  # default city
        lbe = LabelEncoder()
        data[column_name] = lbe.fit_transform(data[column_name])
        feature_max_idx[column_name] = data[column_name].max() + 1
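    # Illustrative example (assumed values): if userCity holds {"北京", "上海"},
    # LabelEncoder maps them to {0, 1} and feature_max_idx["userCity"] == 2,
    # which serves as the embedding vocabulary size below.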
    key2index = {}  # vocabulary for the `split` helper (only needed by the weighted genres option below)
    print("\n\n ************ data process finish")
    # Collect each user's positively-labeled videos as a behavior history.
    user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
    max_len = 50
    print(user_video_list_df["mid"])
    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(len)
    print(user_video_list_df)
    print(len(user_video_list_df))
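    # Illustrative example (assumed ids): if user 7 has positive labels on
    # videos [12, 5, 9], the groupby above yields hist_video_id == [12, 5, 9]
    # and hist_len == 3 for mid == 7.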
    emb_dim = 10
    fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
                              for feat in sparse_features]
    print(fixlen_feature_columns)
    use_weighted_sequence = False
    if use_weighted_sequence:
        # Notice: value 0 is the padding value for sequence input features.
        varlen_feature_columns = [VarLenSparseFeat(
            SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
            maxlen=max_len, combiner='mean', weight_name='genres_weight')]
    else:
        # The history sequence shares its embedding table with "videoid" via
        # embedding_name; value 0 is again reserved for padding.
        varlen_feature_columns = [VarLenSparseFeat(
            SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"],
                       embedding_dim=emb_dim, embedding_name="videoid"),
            maxlen=max_len, combiner='mean', length_name="hist_len")]
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    print(dnn_feature_columns)
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
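    # get_feature_names returns the input keys the model will expect, e.g.
    # ["videoid", "mid", ..., "hist_video_id", "hist_len"]; every name listed
    # there must get a matching array in model_input below.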
    df_merge = pd.merge(left=data,
                        right=user_video_list_df,
                        left_on="mid",
                        right_on="mid",
                        how="right")
    print(df_merge.head())
    print("df_merge len is: ", len(df_merge))

    # Shuffle all rows before the validation split in fit().
    df_merge = df_merge.sample(frac=1.0)
    del data, user_video_list_df
    gc.collect()
    print("after sample df_merge len is: ", len(df_merge))
    model_input = {name: df_merge[name] for name in sparse_features}
    # Pad or truncate every history to exactly max_len ids; 0 is the padding id.
    video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len,
                                       padding='post', truncating='post', value=0)
    model_input["hist_video_id"] = video_hist_seq_pad
    print("\n\n\n")
    print(video_hist_seq_pad)
    print("\n\nuser_vids_input len is: ", len(df_merge["hist_video_id"]))
    # model_input["genres_weight"] = np.random.randn(df_merge.shape[0], max_len, 1)
    # Cap the recorded length at max_len so it matches the truncated sequences.
    model_input["hist_len"] = np.minimum(df_merge["hist_len"].values, max_len)
    print("\n\nuser_vids_len_input len is: ", len(df_merge["hist_len"]))
    model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='binary')
    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    output_model_file = os.path.join(logdir, 'fibinet_model.h5')

    callbacks = [
        tf.keras.callbacks.TensorBoard(logdir),
        tf.keras.callbacks.ModelCheckpoint(output_model_file,
                                           save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
    ]
    METRICS = [
        tf.keras.metrics.TruePositives(name='tp'),
        tf.keras.metrics.FalsePositives(name='fp'),
        tf.keras.metrics.TrueNegatives(name='tn'),
        tf.keras.metrics.FalseNegatives(name='fn'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
        tf.keras.metrics.AUC(name='auc-PRC', curve='PR'),
    ]
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=METRICS
    )
    model.fit(model_input, df_merge[target].values,
              batch_size=2048, epochs=5,
              verbose=2, validation_split=0.2, callbacks=callbacks)
    model.save("./tensorflow_fibinet-0325-tzld-2.h5")
    # Export a versioned SavedModel directory (the TF Serving layout).
    tf.keras.models.save_model(model,
                               "/work/xielixun/xDeepFM0325/tensorflow_fibinet-0325-tzld/001",
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
    print("******* train FiBiNET cost time is: " + str(time.time() - begin_time))