#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2025 StrayWarrior <i@straywarrior.com>
#
# Distributed under terms of the MIT license.
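"""
1. Remove the viewall feature, which tends to introduce bias.
2. Remove the cpa feature, whose buckets are unevenly distributed.
3. Reduce the number of dense features.
4. Add user-item (U-I) cross statistics.
5. Add dense features to the linear part.
6. Reduce the embeddings in the wide part.
7. Reduce some of the bucket sizes.
"""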
  17. raw_input = open("data_fields_v6.config").readlines()
  18. input_fields = dict(
  19. map(lambda x: (x[0], x[1]),
  20. map(lambda x: x.strip().split(' '), raw_input)))
  21. def read_features(filename, excludes=None):
  22. features = open(filename).readlines()
  23. features = [name.strip().lower() for name in features]
  24. if excludes:
  25. for x in excludes:
  26. if x in features:
  27. features.remove(x)
  28. return features
  29. exclude_features = ['viewall', 'cpa']
  30. dense_features = read_features("features_top300_new.config", exclude_features)
  31. top_dense_features = read_features('features_top50_new.config', exclude_features)
  32. sparse_features = [
  33. "cid", "adid", "adverid",
  34. "region", "city", "brand",
  35. "vid", "cate1", "cate2",
  36. "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "title_split",
  37. "user_has_conver_1y",
  38. "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
  39. "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
  40. "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
  41. "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
  42. "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
  43. "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d",
  44. "profession", "user_layer", "landing", "flag",
  45. # "category_name", "customer"
  46. ]
  47. tag_features = [
  48. "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
  49. "user_vid_return_tags_7d", "user_vid_return_tags_14d"
  50. ]
  51. seq_features = [
  52. "user_cid_click_list", "user_cid_conver_list"
  53. ]
  54. input_type_map = {
  55. 'BIGINT': 'INT64',
  56. 'DOUBLE': 'DOUBLE',
  57. 'STRING': 'STRING'
  58. }
  59. bucket_size_map = {
  60. 'adverid': 100000,
  61. 'region': 1000,
  62. 'city': 10000,
  63. 'brand': 10000,
  64. 'cate1': 10000,
  65. 'cate2': 10000,
  66. 'apptype': 1000,
  67. 'hour': 1000, # 实际上可以直接指定词表
  68. 'hour_quarter': 4000,
  69. 'root_source_scene': 100,
  70. 'root_source_channel': 1000,
  71. 'is_first_layer': 100,
  72. 'user_has_conver_1y': 100,
  73. 'profession': 500,
  74. 'category_name': 500,
  75. 'user_layer': 200,
  76. 'customer': 3000,
  77. 'landing': 100,
  78. 'flag': 50,
  79. }
  80. print("""train_config {
  81. optimizer_config {
  82. adam_optimizer {
  83. learning_rate {
  84. constant_learning_rate {
  85. learning_rate: 0.0010
  86. }
  87. }
  88. }
  89. use_moving_average: false
  90. }
  91. optimizer_config {
  92. adam_optimizer {
  93. learning_rate {
  94. constant_learning_rate {
  95. learning_rate: 0.0006
  96. }
  97. }
  98. }
  99. use_moving_average: false
  100. }
  101. optimizer_config {
  102. adam_optimizer {
  103. learning_rate {
  104. constant_learning_rate {
  105. learning_rate: 0.002
  106. }
  107. }
  108. }
  109. use_moving_average: false
  110. }
  111. num_steps: 200000
  112. sync_replicas: true
  113. save_checkpoints_steps: 1000
  114. log_step_count_steps: 200
  115. save_summary_steps: 200
  116. }
  117. eval_config {
  118. metrics_set {
  119. auc {
  120. }
  121. }
  122. eval_online: true
  123. eval_interval_secs: 1200
  124. }
  125. data_config {
  126. batch_size: 512
  127. num_epochs: 1
  128. """)
  129. for name in input_fields:
  130. input_type = input_type_map[input_fields[name]]
  131. default_spec = ''
  132. if name in dense_features:
  133. default_spec = '\n default_val: "0"'
  134. print(f""" input_fields {{
  135. input_name: "{name}"
  136. input_type: {input_type}{default_spec}
  137. }}""")
  138. # default_val: "0"
  139. print(""" label_fields: "has_conversion"
  140. prefetch_size: 32
  141. input_type: OdpsInputV2
  142. }
  143. """)
  144. for name in dense_features:
  145. print(f"""feature_configs {{
  146. input_names: "{name}"
  147. feature_type: RawFeature
  148. boundaries: [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
  149. embedding_dim: 6
  150. }}""")
  151. for name in sparse_features:
  152. bucket_size = bucket_size_map.get(name, 1000000)
  153. print(f"""feature_configs {{
  154. input_names: "{name}"
  155. feature_type: IdFeature
  156. hash_bucket_size: {bucket_size}
  157. embedding_dim: 6
  158. }}""")
  159. for name in tag_features + seq_features:
  160. bucket_size = bucket_size_map.get(name, 1000000)
  161. print(f"""feature_configs {{
  162. input_names: "{name}"
  163. feature_type: TagFeature
  164. hash_bucket_size: {bucket_size}
  165. embedding_dim: 6
  166. separator: ','
  167. }}""")
  168. def wide_and_deep():
  169. print("""
  170. model_config {
  171. model_class: "WideAndDeep"
  172. feature_groups: {
  173. group_name: 'wide'""")
  174. for name in dense_features + sparse_features:
  175. print(f""" feature_names: '{name}'""")
  176. print(""" wide_deep: WIDE
  177. }
  178. feature_groups: {
  179. group_name: 'deep'""")
  180. for name in dense_features + sparse_features + tag_features + seq_features:
  181. print(f""" feature_names: '{name}'""")
  182. print(""" wide_deep: DEEP
  183. }
  184. wide_and_deep {
  185. wide_output_dim: 8
  186. dnn {
  187. hidden_units: [256, 128, 64]
  188. }
  189. final_dnn {
  190. hidden_units: [64, 32]
  191. }
  192. l2_regularization: 1e-5
  193. }
  194. embedding_regularization: 1e-6
  195. }""")
  196. def deep_fm():
  197. print("""
  198. model_config {
  199. model_class: "DeepFM"
  200. feature_groups: {
  201. group_name: 'wide'""")
  202. for name in dense_features + sparse_features:
  203. print(f""" feature_names: '{name}'""")
  204. print(""" wide_deep: WIDE
  205. }
  206. feature_groups: {
  207. group_name: 'deep'""")
  208. for name in top_dense_features + sparse_features + tag_features + seq_features:
  209. print(f""" feature_names: '{name}'""")
  210. print(""" wide_deep: DEEP
  211. }
  212. deepfm {
  213. wide_output_dim: 2
  214. dnn {
  215. hidden_units: [256, 128, 64]
  216. }
  217. final_dnn {
  218. hidden_units: [64, 32]
  219. }
  220. l2_regularization: 1e-5
  221. }
  222. embedding_regularization: 1e-6
  223. }""")
  224. def fm():
  225. print("""
  226. model_config {
  227. model_class: "FM"
  228. feature_groups: {
  229. group_name: 'wide'""")
  230. for name in dense_features:
  231. print(f""" feature_names: '{name}'""")
  232. print(""" wide_deep: WIDE
  233. }
  234. feature_groups: {
  235. group_name: 'deep'""")
  236. for name in dense_features:
  237. print(f""" feature_names: '{name}'""")
  238. print(""" wide_deep: DEEP
  239. }
  240. fm {
  241. }
  242. embedding_regularization: 1e-5
  243. }""")
  244. def config_export():
  245. print("""
  246. export_config {
  247. exporter_type: "final"
  248. }
  249. """)
  250. deep_fm()
  251. config_export()