# DeepFM_tzld_rank.py (5.1 KB)
# encoding: utf-8
import time

import tensorflow as tf
  4. begin_time = time.time()
  5. training_samples_file_path = tf.keras.utils.get_file("user_action_app20210103_10day.csv",
  6. "file:///root/xielixun/user_action_app20210103_10day.csv")
# Load the CSV samples as batched tf.data datasets.
  8. def get_dataset(file_path):
  9. dataset = tf.data.experimental.make_csv_dataset(
  10. file_path,
  11. batch_size=4096,
  12. label_name='label',
  13. na_value="0",
  14. num_epochs=1,
  15. ignore_errors=True)
  16. return dataset
  17. def get_dataset_test(file_path):
  18. dataset = tf.data.experimental.make_csv_dataset(
  19. file_path,
  20. batch_size=2,
  21. label_name='label',
  22. na_value="0",
  23. num_epochs=1,
  24. ignore_errors=True)
  25. return dataset
  26. train_dataset = get_dataset(training_samples_file_path)
  27. inputs = {
  28. 'videoAvgRating': tf.keras.layers.Input(name='videoAvgRating', shape=(), dtype='float32'),
  29. 'videoRatingStddev': tf.keras.layers.Input(name='videoRatingStddev', shape=(), dtype='float32'),
  30. 'userAvgRating': tf.keras.layers.Input(name='userAvgRating', shape=(), dtype='float32'),
  31. 'userRatingStddev': tf.keras.layers.Input(name='userRatingStddev', shape=(), dtype='float32'),
  32. 'videoid': tf.keras.layers.Input(name='videoid', shape=(), dtype='int32'),
  33. 'userid': tf.keras.layers.Input(name='userid', shape=(), dtype='int64'),
  34. 'userRatedVideo1': tf.keras.layers.Input(name='userRatedVideo1', shape=(), dtype='int32'),
  35. 'userRealplayCount': tf.keras.layers.Input(name='userRealplayCount', shape=(), dtype='int32'),
  36. 'videoRealPlayCount': tf.keras.layers.Input(name='videoRealPlayCount', shape=(), dtype='int32'),
  37. 'videoDuration': tf.keras.layers.Input(name='videoDuration', shape=(), dtype='float32')
  38. }
  39. # video id embedding feature
  40. video_col = tf.feature_column.categorical_column_with_identity(key='videoid', num_buckets=7000000)
  41. video_emb_col = tf.feature_column.embedding_column(video_col, 64)
  42. video_ind_col = tf.feature_column.indicator_column(video_col) # videoid id indicator columns
  43. # user id embedding feature
  44. user_col = tf.feature_column.categorical_column_with_identity(key='userid', num_buckets=1600000)
  45. user_emb_col = tf.feature_column.embedding_column(user_col, 64)
  46. user_ind_col = tf.feature_column.indicator_column(user_col) # user id indicator columns
  47. fm_first_order_columns = [video_ind_col, user_ind_col]
  48. deep_feature_columns = [
  49. tf.feature_column.numeric_column('videoAvgRating'),
  50. tf.feature_column.numeric_column('videoRatingStddev'),
  51. # tf.feature_column.numeric_column('userRatingCount'),
  52. tf.feature_column.numeric_column('userAvgRating'),
  53. tf.feature_column.numeric_column('userRatingStddev'),
  54. # userRealplayCount,videoRealPlayCount,videoDuration
  55. tf.feature_column.numeric_column("userRealplayCount"),
  56. tf.feature_column.numeric_column("videoRealPlayCount"),
  57. tf.feature_column.numeric_column("videoDuration"),
  58. video_emb_col,
  59. user_emb_col]
  60. item_emb_layer = tf.keras.layers.DenseFeatures([video_emb_col])(inputs)
  61. user_emb_layer = tf.keras.layers.DenseFeatures([user_emb_col])(inputs)
  62. # The first-order term in the FM layer
  63. fm_first_order_layer = tf.keras.layers.DenseFeatures(fm_first_order_columns)(inputs)
  64. # FM part, cross different categorical feature embeddings
  65. product_layer_item_user = tf.keras.layers.Dot(axes=1)([item_emb_layer, user_emb_layer])
  66. # deep part, MLP to generalize all input features
  67. deep = tf.keras.layers.DenseFeatures(deep_feature_columns)(inputs)
  68. deep = tf.keras.layers.Dense(64, activation='relu')(deep)
  69. deep = tf.keras.layers.Dense(64, activation='relu')(deep)
  70. # concatenate fm part and deep part
  71. concat_layer = tf.keras.layers.concatenate([fm_first_order_layer, product_layer_item_user, deep], axis=1)
  72. output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer)
  73. model = tf.keras.Model(inputs, output_layer)
  74. # compile the model, set loss function, optimizer and evaluation metrics
  75. model.compile(
  76. loss='binary_crossentropy',
  77. optimizer='adam',
  78. metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC'), tf.keras.metrics.AUC(curve='PR')])
  79. model.fit(train_dataset, epochs=5)
  80. import time
  81. saved_model_path = "./saved_models/{}".format(int(time.time()))
  82. tf.keras.models.save_model(model,
  83. "file:///root/xielixun/ctr_sort_model/deepfm_model_10dayApp",
  84. overwrite=True,
  85. include_optimizer=True,
  86. save_format=None,
  87. signatures=None,
  88. options=None)
  89. # evaluate the model
  90. test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(train_dataset)
  91. print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
  92. test_roc_auc, test_pr_auc))
  93. print("deepFM training cost time is: " + str(time.time() - begin_time))