# encoding: utf-8
import tensorflow as tf
import time

# Wall-clock start of the whole run, reported at the end of the script.
begin_time = time.time()

# Fetch (or reuse the keras cache of) the training-sample CSV.
# NOTE(review): the source URL uses the local file:// scheme, so this
# assumes the CSV already exists on this machine — confirm the path.
training_samples_file_path = tf.keras.utils.get_file(
    "user_action_app20210103_10day.csv",
    "file:///root/xielixun/user_action_app20210103_10day.csv")
- # load sample as tf dataset
# load sample as tf dataset
def get_dataset(file_path, batch_size=4096):
    """Load a labelled CSV sample file as a batched ``tf.data.Dataset``.

    Args:
        file_path: Path to a CSV file containing a ``label`` column.
        batch_size: Rows per batch. Defaults to 4096, the training batch
            size this script was originally written with, so existing
            callers are unaffected.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches for
        a single pass over the file.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='label',
        na_value="0",        # missing cells are read as the string "0"
        num_epochs=1,        # one pass; Keras fit(epochs=...) handles repeats
        ignore_errors=True)  # skip malformed rows instead of raising
    return dataset
def get_dataset_test(file_path, batch_size=2):
    """Load a labelled CSV sample file as a small-batch ``tf.data.Dataset``.

    Identical to ``get_dataset`` except for the default batch size, which
    is tiny (2) to make interactive inspection of test samples easy.

    Args:
        file_path: Path to a CSV file containing a ``label`` column.
        batch_size: Rows per batch (default 2, preserving original behavior).

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches for
        a single pass over the file.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='label',
        na_value="0",        # missing cells are read as the string "0"
        num_epochs=1,        # one pass over the file
        ignore_errors=True)  # skip malformed rows instead of raising
    return dataset
train_dataset = get_dataset(training_samples_file_path)

# Keras symbolic inputs, one scalar input per CSV feature column.
# Grouped here by dtype so the dict literal stays short.
_FLOAT32_FEATURES = ('videoAvgRating', 'videoRatingStddev',
                     'userAvgRating', 'userRatingStddev', 'videoDuration')
_INT32_FEATURES = ('videoid', 'userRatedVideo1',
                   'userRealplayCount', 'videoRealPlayCount')

inputs = {name: tf.keras.layers.Input(name=name, shape=(), dtype='float32')
          for name in _FLOAT32_FEATURES}
inputs.update({name: tf.keras.layers.Input(name=name, shape=(), dtype='int32')
               for name in _INT32_FEATURES})
# user ids exceed the int32 range convention used elsewhere; keep int64.
inputs['userid'] = tf.keras.layers.Input(name='userid', shape=(), dtype='int64')
# ---- categorical id features: embedding + one-hot (indicator) columns ----
fc = tf.feature_column

# video id -> identity bucket per id, embedded to 64 dimensions
video_col = fc.categorical_column_with_identity(key='videoid', num_buckets=7000000)
video_emb_col = fc.embedding_column(video_col, 64)
video_ind_col = fc.indicator_column(video_col)  # one-hot video id

# user id -> identity bucket per id, embedded to 64 dimensions
user_col = fc.categorical_column_with_identity(key='userid', num_buckets=1600000)
user_emb_col = fc.embedding_column(user_col, 64)
user_ind_col = fc.indicator_column(user_col)  # one-hot user id

# First-order (linear) FM term is fed by the raw one-hot id columns.
# NOTE(review): indicator columns over 7M / 1.6M buckets densify to very
# wide tensors — verify memory headroom before scaling the id spaces up.
fm_first_order_columns = [video_ind_col, user_ind_col]

# Deep-tower features: raw numeric stats plus the two id embeddings.
# ('userRatingCount' was intentionally left out of the original column set.)
deep_feature_columns = [
    fc.numeric_column('videoAvgRating'),
    fc.numeric_column('videoRatingStddev'),
    fc.numeric_column('userAvgRating'),
    fc.numeric_column('userRatingStddev'),
    fc.numeric_column('userRealplayCount'),
    fc.numeric_column('videoRealPlayCount'),
    fc.numeric_column('videoDuration'),
    video_emb_col,
    user_emb_col,
]
# ---- wire up the DeepFM graph ----

# Dense embedding lookups for the two id features.
item_emb_layer = tf.keras.layers.DenseFeatures([video_emb_col])(inputs)
user_emb_layer = tf.keras.layers.DenseFeatures([user_emb_col])(inputs)

# First-order FM term built from the one-hot id columns.
fm_first_order_layer = tf.keras.layers.DenseFeatures(fm_first_order_columns)(inputs)

# Second-order FM interaction: dot product of the user and item embeddings.
product_layer_item_user = tf.keras.layers.Dot(axes=1)([item_emb_layer, user_emb_layer])

# Deep tower: every numeric + embedding feature through a two-layer MLP.
deep = tf.keras.layers.DenseFeatures(deep_feature_columns)(inputs)
for _ in range(2):
    deep = tf.keras.layers.Dense(64, activation='relu')(deep)

# Fuse the FM terms with the deep tower, then a sigmoid CTR head.
concat_layer = tf.keras.layers.concatenate(
    [fm_first_order_layer, product_layer_item_user, deep], axis=1)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer)
model = tf.keras.Model(inputs, output_layer)
# Compile: binary cross-entropy CTR loss, Adam optimizer, and three
# evaluation metrics (accuracy plus ROC and precision-recall AUC).
eval_metrics = [
    'accuracy',
    tf.keras.metrics.AUC(curve='ROC'),
    tf.keras.metrics.AUC(curve='PR'),
]
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=eval_metrics)

model.fit(train_dataset, epochs=5)
# Persist the trained model to disk.
# NOTE(review): the destination is a file:// URI — tf.io.gfile accepts
# this scheme, but confirm it resolves as intended on this host.
# (Removed: a duplicate `import time` — already imported at the top of the
# file — and an unused `saved_model_path` variable that was computed from
# the current timestamp but never passed to save_model.)
tf.keras.models.save_model(
    model,
    "file:///root/xielixun/ctr_sort_model/deepfm_model_10dayApp",
    overwrite=True,
    include_optimizer=True,
    save_format=None,   # let TF pick its default serialization format
    signatures=None,
    options=None)

# Evaluate the model.
# NOTE(review): this evaluates on the *training* dataset even though the
# printout says "Test" — the metrics are optimistic; use a held-out set.
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(train_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(
    test_loss, test_accuracy, test_roc_auc, test_pr_auc))

print("deepFM training cost time is: " + str(time.time() - begin_time))