# encoding: utf-8
import tensorflow as tf
import numpy as np
import pandas as pd
import gc
import os
import time
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

begin_time = time.time()
data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")

sparse_features = ["videoid", "mid",
                   "videoGenre1", "videoGenre2",
                   "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                   "userGenre1", "userGenre2", "userCity",
                   "authorid", "userRealplayCount",
                   "videoRealPlayCount", "videoDuration"]
target = ['label']

data = data.sort_values(by='logtimestamp', ascending=False)

# Keep every generated label id so it can be reused online later for feature joining.
feature_max_idx = {}
data = data[data["mid"] != "unknown"].copy()
data["mid"].replace("unknown", "N000111111D", inplace=True)
data = data[data["mid"] != "N000111111D"].copy()


# Same functionality as the corresponding helper in the deepMatch DSSM example:
# build a dense index for mid while keeping the raw id in uidx.
def add_index_column(param_df, column_name):
    values = list(param_df[column_name].unique())
    value_index_dict = {value: idx for idx, value in enumerate(values)}
    if column_name == "mid":
        param_df["uidx"] = param_df[column_name].copy()
        param_df["mid"] = param_df[column_name].map(value_index_dict) + 1
        feature_max_idx["mid"] = param_df["mid"].max() + 1


add_index_column(data, "mid")

for column_name in sparse_features:
    lbe = LabelEncoder()
    print("\n\n-------------- " + column_name)
    print(data[column_name])

    # Genre columns stay as raw strings (used later with vocabulary-list columns);
    # only fill missing values with a default genre.
    if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
                       "userGenre1", "userGenre2", "userGenre3"):
        data[column_name].fillna("社会", inplace=True)
        continue

    if column_name == "userCity":
        data[column_name].fillna("北京", inplace=True)

    if column_name == "mid":
        continue

    if column_name == "videoid":
        # Keep the raw video id in vidx and store the encoded id in videoid.
        data["vidx"] = data[column_name].copy()
        data["videoid"] = lbe.fit_transform(data[column_name]) + 1
        feature_max_idx["videoid"] = data["videoid"].max() + 1
    else:
        data[column_name] = lbe.fit_transform(data[column_name])
        feature_max_idx[column_name] = data[column_name].max() + 1

key2index = {}
print("\n\n ************ data process finish")
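# --- Hedged sketch (not in the original script) --------------------------------
# The note above says the generated label ids must be kept for online feature
# joining; this optional block shows one way to export the mid/videoid mappings
# and the vocabulary sizes. Directory and file names below are assumptions.
SAVE_ID_MAPS = False  # flip to True to export the mapping tables
if SAVE_ID_MAPS:
    id_map_dir = "./id_maps"  # hypothetical output directory
    os.makedirs(id_map_dir, exist_ok=True)
    data[["uidx", "mid"]].drop_duplicates().to_csv(
        os.path.join(id_map_dir, "mid_index_map.csv"), index=False)
    data[["vidx", "videoid"]].drop_duplicates().to_csv(
        os.path.join(id_map_dir, "videoid_index_map.csv"), index=False)
    pd.DataFrame({"feature": list(feature_max_idx.keys()),
                  "max_idx": list(feature_max_idx.values())}).to_csv(
        os.path.join(id_map_dir, "feature_max_idx.csv"), index=False)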
# Positive-label history: the list of videos each user interacted with.
user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)

max_len = 50
mid_list = list(user_video_list_df["mid"])
user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(lambda x: len(x))

emb_dim = 10
df_merge = pd.merge(left=data, right=user_video_list_df,
                    left_on="mid", right_on="mid", how="right")
df_merge.head()
df_merge = df_merge.sample(frac=1.0)

g1 = data["videoGenre1"].values
g2 = data["videoGenre2"].values
g4 = data["userGenre1"].values
g5 = data["userGenre2"].values

del data, user_video_list_df
gc.collect()

model_input = {name: df_merge[name] for name in sparse_features}

video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len,
                                   padding='post', truncating='post', value=0)
model_input["hist_video_id"] = video_hist_seq_pad
model_input["hist_len"] = df_merge["hist_len"]

# Union of all genre values seen on the video and user side.
genre_set = np.concatenate([g1, g2, g4, g5])
genre_vocab = list(set(genre_set))

GENRE_FEATURES = {
    'userGenre1': genre_vocab,
    'userGenre2': genre_vocab,
    'videoGenre1': genre_vocab,
    'videoGenre2': genre_vocab,
}

emb_dim1 = 10
emb_dim2 = 32
categorical_columns = []
for feature, vocab in GENRE_FEATURES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    emb_col = tf.feature_column.embedding_column(cat_col, emb_dim1)
    categorical_columns.append(emb_col)

# video id embedding feature
video_col = tf.feature_column.categorical_column_with_identity(
    key='videoid', num_buckets=feature_max_idx["videoid"])
video_emb_col = tf.feature_column.embedding_column(video_col, emb_dim2)
categorical_columns.append(video_emb_col)

# user id embedding feature
user_col = tf.feature_column.categorical_column_with_identity(
    key='mid', num_buckets=feature_max_idx["mid"])
# user_emb_col = tf.feature_column.embedding_column(user_col, 10)
user_emb_col = tf.feature_column.embedding_column(user_col, emb_dim2)
categorical_columns.append(user_emb_col)

user_city_col = tf.feature_column.categorical_column_with_identity(
    key='userCity', num_buckets=feature_max_idx["userCity"])
user_city_emb_col = tf.feature_column.embedding_column(user_city_col, emb_dim1)
categorical_columns.append(user_city_emb_col)

authorid_col = tf.feature_column.categorical_column_with_identity(
    key='authorid', num_buckets=feature_max_idx["authorid"])
authorid_emb_col = tf.feature_column.embedding_column(authorid_col, emb_dim2)
categorical_columns.append(authorid_emb_col)

numerical_columns = [
    tf.feature_column.numeric_column("userRealplayCount"),
    tf.feature_column.numeric_column("videoRealPlayCount"),
    tf.feature_column.numeric_column("videoDuration")
]

# cross feature between the current video and the user's recently rated videos
rated_video1 = tf.feature_column.categorical_column_with_identity(
    key='userRatedVideo1', num_buckets=feature_max_idx["videoid"])
rated_video2 = tf.feature_column.categorical_column_with_identity(
    key='userRatedVideo2', num_buckets=feature_max_idx["videoid"])
rated_video3 = tf.feature_column.categorical_column_with_identity(
    key='userRatedVideo3', num_buckets=feature_max_idx["videoid"])
crossed_feature = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column(
        [video_col, rated_video1, rated_video2, rated_video3],
        feature_max_idx["videoid"]))
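# --- Hedged illustration (not part of the original pipeline) -------------------
# Shows what the wide side consumes: each (videoid, userRatedVideo*) combination
# is hashed into one of feature_max_idx["videoid"] buckets and exposed as a
# multi-hot indicator vector. The toy ids below are made up for illustration.
DEMO_CROSSED_FEATURE = False  # flip to True to print the demo output
if DEMO_CROSSED_FEATURE:
    toy_batch = {
        'videoid': tf.constant([1, 2]),
        'userRatedVideo1': tf.constant([3, 4]),
        'userRatedVideo2': tf.constant([5, 6]),
        'userRatedVideo3': tf.constant([7, 8]),
    }
    demo_wide = tf.keras.layers.DenseFeatures([crossed_feature])(toy_batch)
    print("crossed feature output shape:", demo_wide.shape)  # (2, num_buckets)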
user_features = ["uidx", "mid", "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
                 "userGenre1", "userGenre2", "userCity", "userRealplayCount",
                 "hist_video_id", "hist_len"]
item_features = ["vidx", "videoid", "videoGenre1", "videoGenre2",
                 "authorid", "videoRealPlayCount", "videoDuration"]

# Serialize the padded history sequence as a space-separated string for export
# (functionally equivalent to joining the original hist_video_id lists).
df_merge["hist_video_id"] = list(map(lambda x: ' '.join(list(map(str, x))), video_hist_seq_pad))

user_df = df_merge[user_features].drop_duplicates('uidx')
video_df = df_merge[item_features].drop_duplicates('vidx')
df_target = df_merge[target].copy()

del df_merge
gc.collect()

from datetime import datetime
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)

del user_df, video_df
gc.collect()

inputs = {
    'videoid': tf.keras.layers.Input(name='videoid', shape=(), dtype='int32'),
    'mid': tf.keras.layers.Input(name='mid', shape=(), dtype='int32'),
    'userRatedVideo1': tf.keras.layers.Input(name='userRatedVideo1', shape=(), dtype='int32'),
    'userRatedVideo2': tf.keras.layers.Input(name='userRatedVideo2', shape=(), dtype='int32'),
    'userRatedVideo3': tf.keras.layers.Input(name='userRatedVideo3', shape=(), dtype='int32'),
    'userRealplayCount': tf.keras.layers.Input(name='userRealplayCount', shape=(), dtype='int32'),
    'videoRealPlayCount': tf.keras.layers.Input(name='videoRealPlayCount', shape=(), dtype='int32'),
    'videoDuration': tf.keras.layers.Input(name='videoDuration', shape=(), dtype='int32'),
    'authorid': tf.keras.layers.Input(name='authorid', shape=(), dtype='int32'),
    'userCity': tf.keras.layers.Input(name='userCity', shape=(), dtype='int32'),
    'userGenre1': tf.keras.layers.Input(name='userGenre1', shape=(), dtype='string'),
    'userGenre2': tf.keras.layers.Input(name='userGenre2', shape=(), dtype='string'),
    'videoGenre1': tf.keras.layers.Input(name='videoGenre1', shape=(), dtype='string'),
    'videoGenre2': tf.keras.layers.Input(name='videoGenre2', shape=(), dtype='string'),
    'hist_video_id': tf.keras.Input(shape=(50,), name='hist_video_id', dtype='int32'),
    'hist_len': tf.keras.Input(shape=(), name='hist_len', dtype='int32'),
}

from tensorflow.python.feature_column.feature_column import _LazyBuilder  # (unused in this script)

# The 50-step watch history is hashed into 2x the video vocabulary size; the
# embedding_column's default mean combiner pools over the positions
# (padding zeros are hashed as well).
hist_video_column = tf.feature_column.categorical_column_with_hash_bucket(
    'hist_video_id', 2 * feature_max_idx["videoid"], dtype=tf.int32)
hist_video_embedded = tf.feature_column.embedding_column(hist_video_column, emb_dim2)
categorical_columns.append(hist_video_embedded)

# deep part: dense numeric features plus all embedding columns
deep = tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns)(inputs)
deep = tf.keras.layers.Dense(128, activation='relu')(deep)
deep = tf.keras.layers.Dense(64, activation='relu')(deep)

# wide part for the cross feature
wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)
both = tf.keras.layers.concatenate([deep, wide])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(both)
model = tf.keras.Model(inputs, output_layer)

logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory
if not os.path.exists(logdir):
    os.mkdir(logdir)
output_model_file = os.path.join(logdir, 'wdl_model.h5')

callbacks = [
    tf.keras.callbacks.TensorBoard(logdir),
    tf.keras.callbacks.ModelCheckpoint(output_model_file, save_best_only=True),
    tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
]

METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
    tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
]

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS)

model.fit(model_input, df_target,
          batch_size=4096, epochs=1, verbose=2,
          validation_split=0.2, callbacks=callbacks)

model.save("./tensorflow_WDL-0416-tzld-2.h5")

tf.keras.models.save_model(
    model,
    "file:///work/xielixun/WDL_0416/tensorflow_WDL-0416-tzld/001",
    # "file:///Users/xielixun/Desktop/lixunxie/python_code/DeepMatch/examples/WDL_0416/tensorflow_WDL-0416-tzld/001",
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None)

print("training WDL in 2021-0415 cost time: " + str(time.time() - begin_time))
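# --- Hedged sketch (not in the original script) ---------------------------------
# Optional sanity check: reload the exported SavedModel and score a batch. The
# path mirrors the save path above and is an assumption; adjust as needed.
RELOAD_CHECK = False  # flip to True after the export above has succeeded
if RELOAD_CHECK:
    reloaded = tf.keras.models.load_model(
        "/work/xielixun/WDL_0416/tensorflow_WDL-0416-tzld/001")
    preds = reloaded.predict(model_input, batch_size=4096)
    print("reloaded model prediction sample:", preds[:5].ravel())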