# encoding: utf-8
import tensorflow as tf
import time

begin_time = time.time()

training_samples_file_path = tf.keras.utils.get_file(
    "user_action_app20210103_10day.csv",
    "file:///root/xielixun/user_action_app20210103_10day.csv")


# load samples as a tf.data dataset
def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=4096,
        label_name='label',
        na_value="0",
        num_epochs=1,
        ignore_errors=True)
    return dataset


def get_dataset_test(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=2,
        label_name='label',
        na_value="0",
        num_epochs=1,
        ignore_errors=True)
    return dataset


train_dataset = get_dataset(training_samples_file_path)

# model inputs: one Keras Input per raw feature column
inputs = {
    'videoAvgRating': tf.keras.layers.Input(name='videoAvgRating', shape=(), dtype='float32'),
    'videoRatingStddev': tf.keras.layers.Input(name='videoRatingStddev', shape=(), dtype='float32'),
    'userAvgRating': tf.keras.layers.Input(name='userAvgRating', shape=(), dtype='float32'),
    'userRatingStddev': tf.keras.layers.Input(name='userRatingStddev', shape=(), dtype='float32'),
    'videoid': tf.keras.layers.Input(name='videoid', shape=(), dtype='int32'),
    'userid': tf.keras.layers.Input(name='userid', shape=(), dtype='int64'),
    'userRatedVideo1': tf.keras.layers.Input(name='userRatedVideo1', shape=(), dtype='int32'),
    'userRealplayCount': tf.keras.layers.Input(name='userRealplayCount', shape=(), dtype='int32'),
    'videoRealPlayCount': tf.keras.layers.Input(name='videoRealPlayCount', shape=(), dtype='int32'),
    'videoDuration': tf.keras.layers.Input(name='videoDuration', shape=(), dtype='float32')
}

# video id embedding feature
video_col = tf.feature_column.categorical_column_with_identity(key='videoid', num_buckets=7000000)
video_emb_col = tf.feature_column.embedding_column(video_col, 64)
video_ind_col = tf.feature_column.indicator_column(video_col)  # video id indicator column

# user id embedding feature
user_col = tf.feature_column.categorical_column_with_identity(key='userid', num_buckets=1600000)
user_emb_col = tf.feature_column.embedding_column(user_col, 64)
user_ind_col = tf.feature_column.indicator_column(user_col)  # user id indicator column

# first-order (linear) FM features: one-hot indicator columns
fm_first_order_columns = [video_ind_col, user_ind_col]

# dense features fed into the deep (MLP) part
deep_feature_columns = [
    tf.feature_column.numeric_column('videoAvgRating'),
    tf.feature_column.numeric_column('videoRatingStddev'),
    # tf.feature_column.numeric_column('userRatingCount'),
    tf.feature_column.numeric_column('userAvgRating'),
    tf.feature_column.numeric_column('userRatingStddev'),
    # userRealplayCount, videoRealPlayCount, videoDuration
    tf.feature_column.numeric_column("userRealplayCount"),
    tf.feature_column.numeric_column("videoRealPlayCount"),
    tf.feature_column.numeric_column("videoDuration"),
    video_emb_col,
    user_emb_col]

item_emb_layer = tf.keras.layers.DenseFeatures([video_emb_col])(inputs)
user_emb_layer = tf.keras.layers.DenseFeatures([user_emb_col])(inputs)

# the first-order term of the FM layer
fm_first_order_layer = tf.keras.layers.DenseFeatures(fm_first_order_columns)(inputs)

# FM second-order part: cross the categorical feature embeddings (user x item dot product)
product_layer_item_user = tf.keras.layers.Dot(axes=1)([item_emb_layer, user_emb_layer])

# deep part: an MLP that generalizes over all input features
deep = tf.keras.layers.DenseFeatures(deep_feature_columns)(inputs)
deep = tf.keras.layers.Dense(64, activation='relu')(deep)
deep = tf.keras.layers.Dense(64, activation='relu')(deep)

# concatenate the FM part and the deep part
concat_layer = tf.keras.layers.concatenate([fm_first_order_layer, product_layer_item_user, deep], axis=1)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer)

model = tf.keras.Model(inputs, output_layer)

# compile the model: set the loss function, optimizer and evaluation metrics
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC'), tf.keras.metrics.AUC(curve='PR')])

model.fit(train_dataset, epochs=5)

# export the trained model as a SavedModel
saved_model_path = "./saved_models/{}".format(int(time.time()))  # timestamped path, not used below
tf.keras.models.save_model(model,
                           "file:///root/xielixun/ctr_sort_model/deepfm_model_10dayApp",
                           overwrite=True,
                           include_optimizer=True,
                           save_format=None,
                           signatures=None,
                           options=None)

# evaluate the model (on the training set, since no separate test set is loaded here)
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(train_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(
    test_loss, test_accuracy, test_roc_auc, test_pr_auc))

print("deepFM training cost time is: " + str(time.time() - begin_time))
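# --- Optional usage sketch (an illustration, not part of the original training flow) ---
# A minimal sketch showing how the trained model can score individual samples. It
# reuses get_dataset_test(), which is defined above but otherwise unused, to pull a
# few small batches. The training CSV is reused here only to keep the sketch
# self-contained; in practice a held-out test CSV with the same schema would be used.
test_dataset = get_dataset_test(training_samples_file_path)
for features, labels in test_dataset.take(3):
    # features is a dict of column-name -> tensor, matching the `inputs` dict above
    predictions = model.predict(features)
    for prob, label in zip(predictions, labels.numpy()):
        print("predicted CTR: {:.2%}, actual label: {}".format(prob[0], label))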