widedeep_v12_5.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # vim:fenc=utf-8
  4. #
  5. # Copyright © 2025 StrayWarrior <i@straywarrior.com>
  6. #
  7. # Distributed under terms of the MIT license.
  8. """
  9. Remove the viewall features that tend to introduce bias.
  10. """
  11. raw_input = open("data_fields_v3.config").readlines()
  12. input_fields = dict(
  13. map(lambda x: (x[0], x[1]),
  14. map(lambda x: x.strip().split(' '), raw_input)))
  15. def read_features(filename, excludes=None):
  16. features = open(filename).readlines()
  17. features = [name.strip().lower() for name in features]
  18. if excludes:
  19. for x in excludes:
  20. if x in features:
  21. features.remove(x)
  22. return features
  23. exclude_features = ['viewall', "e1_tags_14d_maxscore","e2_tags_14d_avgscore","e2_tags_14d_maxscore","e1_tags_14d_avgscore","e2_tags_7d_maxscore","e2_tags_7d_avgscore","e2_tags_3d_avgscore","e1_tags_3d_maxscore","e1_tags_7d_maxscore","e2_tags_3d_maxscore","e1_tags_3d_avgscore","e1_tags_7d_avgscore"
  24. ]
  25. dense_features = read_features("features_top300.config", exclude_features)
  26. top_dense_features = read_features('features_top100.config', exclude_features)
  27. sparse_features = [
  28. "cid", "adid", "adverid",
  29. "region", "city", "brand",
  30. "vid", "cate1", "cate2",
  31. "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "title_split",
  32. "profession"
  33. ]
  34. tag_features = [
  35. "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
  36. "user_vid_return_tags_7d", "user_vid_return_tags_14d"
  37. ]
  38. seq_features = [
  39. "user_cid_click_list", "user_cid_conver_list"
  40. ]
  41. input_type_map = {
  42. 'BIGINT': 'INT64',
  43. 'DOUBLE': 'DOUBLE',
  44. 'STRING': 'STRING'
  45. }
  46. print("""train_config {
  47. optimizer_config {
  48. adam_optimizer {
  49. learning_rate {
  50. constant_learning_rate {
  51. learning_rate: 0.0010
  52. }
  53. }
  54. }
  55. use_moving_average: false
  56. }
  57. optimizer_config {
  58. adam_optimizer {
  59. learning_rate {
  60. constant_learning_rate {
  61. learning_rate: 0.0006
  62. }
  63. }
  64. }
  65. use_moving_average: false
  66. }
  67. optimizer_config {
  68. adam_optimizer {
  69. learning_rate {
  70. constant_learning_rate {
  71. learning_rate: 0.002
  72. }
  73. }
  74. }
  75. use_moving_average: false
  76. }
  77. num_steps: 200000
  78. sync_replicas: true
  79. save_checkpoints_steps: 1100
  80. log_step_count_steps: 100
  81. save_summary_steps: 100
  82. }
  83. eval_config {
  84. metrics_set {
  85. auc {
  86. }
  87. }
  88. eval_online: true
  89. eval_interval_secs: 120
  90. }
  91. data_config {
  92. batch_size: 512
  93. num_epochs: 1
  94. """)
  95. for name in input_fields:
  96. input_type = input_type_map[input_fields[name]]
  97. default_spec = ''
  98. if name in dense_features:
  99. default_spec = '\n default_val: "0"'
  100. print(f""" input_fields {{
  101. input_name: "{name}"
  102. input_type: {input_type}{default_spec}
  103. }}""")
  104. # default_val: "0"
  105. print(""" label_fields: "has_conversion"
  106. prefetch_size: 32
  107. input_type: OdpsInputV2
  108. }
  109. """)
  110. for name in dense_features:
  111. print(f"""feature_configs {{
  112. input_names: "{name}"
  113. feature_type: RawFeature
  114. boundaries: [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
  115. embedding_dim: 6
  116. }}""")
  117. for name in sparse_features:
  118. print(f"""feature_configs {{
  119. input_names: "{name}"
  120. feature_type: IdFeature
  121. hash_bucket_size: 1000000
  122. embedding_dim: 6
  123. }}""")
  124. for name in tag_features + seq_features:
  125. print(f"""feature_configs {{
  126. input_names: "{name}"
  127. feature_type: TagFeature
  128. hash_bucket_size: 1000000
  129. embedding_dim: 6
  130. separator: ','
  131. }}""")
  132. def wide_and_deep():
  133. print("""
  134. model_config {
  135. model_class: "WideAndDeep"
  136. feature_groups: {
  137. group_name: 'wide'""")
  138. for name in dense_features + sparse_features:
  139. print(f""" feature_names: '{name}'""")
  140. print(""" wide_deep: WIDE
  141. }
  142. feature_groups: {
  143. group_name: 'deep'""")
  144. for name in dense_features + sparse_features + tag_features + seq_features:
  145. print(f""" feature_names: '{name}'""")
  146. print(""" wide_deep: DEEP
  147. }
  148. wide_and_deep {
  149. wide_output_dim: 8
  150. dnn {
  151. hidden_units: [256, 128, 64]
  152. }
  153. final_dnn {
  154. hidden_units: [64, 32]
  155. }
  156. l2_regularization: 1e-5
  157. }
  158. embedding_regularization: 1e-6
  159. }""")
  160. def deep_fm():
  161. print("""
  162. model_config {
  163. model_class: "DeepFM"
  164. feature_groups: {
  165. group_name: 'wide'""")
  166. for name in dense_features + sparse_features:
  167. print(f""" feature_names: '{name}'""")
  168. print(""" wide_deep: WIDE
  169. }
  170. feature_groups: {
  171. group_name: 'deep'""")
  172. for name in top_dense_features + sparse_features + tag_features + seq_features:
  173. print(f""" feature_names: '{name}'""")
  174. print(""" wide_deep: DEEP
  175. }
  176. deepfm {
  177. wide_output_dim: 8
  178. dnn {
  179. hidden_units: [256, 128, 64]
  180. }
  181. final_dnn {
  182. hidden_units: [64, 32]
  183. }
  184. l2_regularization: 1e-5
  185. }
  186. embedding_regularization: 1e-6
  187. }""")
  188. def fm():
  189. print("""
  190. model_config {
  191. model_class: "FM"
  192. feature_groups: {
  193. group_name: 'wide'""")
  194. for name in dense_features:
  195. print(f""" feature_names: '{name}'""")
  196. print(""" wide_deep: WIDE
  197. }
  198. feature_groups: {
  199. group_name: 'deep'""")
  200. for name in dense_features:
  201. print(f""" feature_names: '{name}'""")
  202. print(""" wide_deep: DEEP
  203. }
  204. fm {
  205. }
  206. embedding_regularization: 1e-5
  207. }""")
  208. def config_export():
  209. print("""
  210. export_config {
  211. exporter_type: "final"
  212. }
  213. """)
  214. deep_fm()
  215. config_export()