widedeep_v12_1.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # vim:fenc=utf-8
  4. #
  5. # Copyright © 2025 StrayWarrior <i@straywarrior.com>
  6. #
  7. # Distributed under terms of the MIT license.
  8. raw_input = open("data_fields_v3.config").readlines()
  9. input_fields = dict(
  10. map(lambda x: (x[0], x[1]),
  11. map(lambda x: x.strip().split(' '), raw_input)))
  12. dense_features = open("features_top300.config").readlines()
  13. dense_features = [name.strip().lower() for name in dense_features]
  14. top_dense_features = open('features_top100.config').readlines()
  15. top_dense_features = [name.strip().lower() for name in top_dense_features]
  16. sparse_features = [
  17. "cid", "adid", "adverid",
  18. "region", "city", "brand",
  19. "vid", "cate1", "cate2",
  20. "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "title_split",
  21. "profession", "creative_type"
  22. ]
  23. tag_features = [
  24. "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
  25. "user_vid_return_tags_7d", "user_vid_return_tags_14d", "user_vid_share_tags_1d", "user_vid_share_tags_14d",
  26. "user_vid_share_tags_1d", "user_vid_share_tags_14d", "user_vid_return_cate1_14d", "user_vid_return_cate2_14d",
  27. "user_vid_share_cate1_14d", "user_vid_share_cate2_14d"
  28. ]
  29. seq_features = [
  30. "user_cid_click_list", "user_cid_conver_list"
  31. ]
  32. nlp_features = [
  33. "creative_hook_embedding", "creative_why_embedding", "creative_action_embedding"
  34. ]
  35. input_type_map = {
  36. 'BIGINT': 'INT64',
  37. 'DOUBLE': 'DOUBLE',
  38. 'STRING': 'STRING'
  39. }
  40. print("""train_config {
  41. optimizer_config {
  42. adam_optimizer {
  43. learning_rate {
  44. constant_learning_rate {
  45. learning_rate: 0.0010
  46. }
  47. }
  48. }
  49. use_moving_average: false
  50. }
  51. optimizer_config {
  52. adam_optimizer {
  53. learning_rate {
  54. constant_learning_rate {
  55. learning_rate: 0.0006
  56. }
  57. }
  58. }
  59. use_moving_average: false
  60. }
  61. optimizer_config {
  62. adam_optimizer {
  63. learning_rate {
  64. constant_learning_rate {
  65. learning_rate: 0.002
  66. }
  67. }
  68. }
  69. use_moving_average: false
  70. }
  71. num_steps: 200000
  72. sync_replicas: true
  73. save_checkpoints_steps: 1100
  74. log_step_count_steps: 100
  75. save_summary_steps: 100
  76. }
  77. eval_config {
  78. metrics_set {
  79. auc {
  80. }
  81. }
  82. eval_online: true
  83. eval_interval_secs: 120
  84. }
  85. data_config {
  86. batch_size: 512
  87. num_epochs: 1
  88. """)
  89. for name in input_fields:
  90. input_type = input_type_map[input_fields[name]]
  91. default_spec = ''
  92. if name in dense_features:
  93. default_spec = '\n default_val: "0"'
  94. print(f""" input_fields {{
  95. input_name: "{name}"
  96. input_type: {input_type}{default_spec}
  97. }}""")
  98. # default_val: "0"
  99. print(""" label_fields: "has_conversion"
  100. prefetch_size: 32
  101. input_type: OdpsInputV2
  102. }
  103. """)
  104. for name in dense_features:
  105. print(f"""feature_configs {{
  106. input_names: "{name}"
  107. feature_type: RawFeature
  108. boundaries: [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
  109. embedding_dim: 6
  110. }}""")
  111. for name in sparse_features:
  112. print(f"""feature_configs {{
  113. input_names: "{name}"
  114. feature_type: IdFeature
  115. hash_bucket_size: 1000000
  116. embedding_dim: 6
  117. }}""")
  118. for name in tag_features + seq_features:
  119. print(f"""feature_configs {{
  120. input_names: "{name}"
  121. feature_type: TagFeature
  122. hash_bucket_size: 1000000
  123. embedding_dim: 6
  124. separator: ','
  125. }}""")
  126. for name in nlp_features:
  127. print(f"""feature_configs {{
  128. input_names: "{name}"
  129. feature_type: TagFeature
  130. hash_bucket_size: 1000000
  131. embedding_dim: 6
  132. separator: '|'
  133. }}""")
  134. def wide_and_deep():
  135. print("""
  136. model_config {
  137. model_class: "WideAndDeep"
  138. feature_groups: {
  139. group_name: 'wide'""")
  140. for name in dense_features + sparse_features:
  141. print(f""" feature_names: '{name}'""")
  142. print(""" wide_deep: WIDE
  143. }
  144. feature_groups: {
  145. group_name: 'deep'""")
  146. for name in dense_features + sparse_features + tag_features + seq_features + nlp_features:
  147. print(f""" feature_names: '{name}'""")
  148. print(""" wide_deep: DEEP
  149. }
  150. wide_and_deep {
  151. wide_output_dim: 8
  152. dnn {
  153. hidden_units: [256, 128, 64]
  154. }
  155. final_dnn {
  156. hidden_units: [64, 32]
  157. }
  158. l2_regularization: 1e-5
  159. }
  160. embedding_regularization: 1e-6
  161. }""")
  162. def deep_fm():
  163. print("""
  164. model_config {
  165. model_class: "DeepFM"
  166. feature_groups: {
  167. group_name: 'wide'""")
  168. for name in dense_features + sparse_features:
  169. print(f""" feature_names: '{name}'""")
  170. print(""" wide_deep: WIDE
  171. }
  172. feature_groups: {
  173. group_name: 'deep'""")
  174. for name in top_dense_features + sparse_features + tag_features + seq_features + nlp_features:
  175. print(f""" feature_names: '{name}'""")
  176. print(""" wide_deep: DEEP
  177. }
  178. deepfm {
  179. wide_output_dim: 8
  180. dnn {
  181. hidden_units: [256, 128, 64]
  182. }
  183. final_dnn {
  184. hidden_units: [64, 32]
  185. }
  186. l2_regularization: 1e-5
  187. }
  188. embedding_regularization: 1e-6
  189. }""")
  190. def fm():
  191. print("""
  192. model_config {
  193. model_class: "FM"
  194. feature_groups: {
  195. group_name: 'wide'""")
  196. for name in dense_features:
  197. print(f""" feature_names: '{name}'""")
  198. print(""" wide_deep: WIDE
  199. }
  200. feature_groups: {
  201. group_name: 'deep'""")
  202. for name in dense_features:
  203. print(f""" feature_names: '{name}'""")
  204. print(""" wide_deep: DEEP
  205. }
  206. fm {
  207. }
  208. embedding_regularization: 1e-5
  209. }""")
  210. def config_export():
  211. print("""
  212. export_config {
  213. exporter_type: "final"
  214. }
  215. """)
  216. deep_fm()
  217. config_export()