#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2025 StrayWarrior <i@straywarrior.com>
#
# Distributed under terms of the MIT license.
def _read_lines(path):
    """Return every line of the text file at *path*, newlines included."""
    # A context manager closes the handle deterministically; the bare
    # open(...).readlines() form left that to the garbage collector.
    with open(path, encoding="utf-8") as handle:
        return handle.readlines()


def _parse_input_fields(lines):
    """Map each field name to its declared type from ``<name> <type>`` lines.

    Tokens are separated by any run of whitespace, blank lines are skipped and
    tokens after the second are ignored. When a name occurs more than once the
    last declaration wins, and the mapping keeps first-seen order.

    Raises:
        ValueError: if a non-blank line has fewer than two tokens.
    """
    fields = {}
    for line_number, line in enumerate(lines, start=1):
        tokens = line.split()
        if not tokens:
            # A blank (often trailing) line is not a declaration.
            continue
        if len(tokens) < 2:
            raise ValueError(
                f"line {line_number}: expected '<name> <type>', got {line!r}"
            )
        fields[tokens[0]] = tokens[1]
    return fields


def _clean_feature_names(lines):
    """Return the stripped, lower-cased names in *lines*, dropping blank lines."""
    stripped = (line.strip() for line in lines)
    # Without the filter a blank line would become an empty feature name and
    # later be emitted as a feature_configs entry of its own.
    return [name.lower() for name in stripped if name]


# Raw lines of the field declaration file, kept under the original name.
raw_input = _read_lines("data_fields_v3.config")
# Field name -> declared type, in file order.
input_fields = _parse_input_fields(raw_input)
# Dense feature names; the file names suggest a top-300 list and a top-100
# list (presumably ranked by importance - confirm with whoever produces them).
dense_features = _clean_feature_names(_read_lines("features_top300.config"))
top_dense_features = _clean_feature_names(_read_lines("features_top100.config"))
# Columns emitted further down as IdFeature feature_configs.
sparse_features = [
    "cid",
    "adid",
    "adverid",
    "region",
    "city",
    "brand",
    "vid",
    "cate1",
    "cate2",
]

# Columns emitted as TagFeature feature_configs: one name per time-window
# suffix (2h, 1d, 3d, 7d, 14d).
tag_features = [
    f"user_vid_return_tags_{window}"
    for window in ("2h", "1d", "3d", "7d", "14d")
]

# List-valued columns; they are emitted as TagFeature together with
# tag_features.
seq_features = ["user_cid_click_list", "user_cid_conver_list"]

# Type name as written in data_fields_v3.config -> input_type value printed
# into the data_config section.
input_type_map = dict(BIGINT="INT64", DOUBLE="DOUBLE", STRING="STRING")
# Learning rates of the three Adam optimizer sections, in output order. They
# are kept as strings so the printed text stays exactly as before (note the
# trailing zero in "0.0010").
_learning_rates = ("0.0010", "0.0006", "0.002")

# One optimizer_config section per rate; the sections differ only in the rate.
_optimizer_sections = "".join(
    "optimizer_config {\n"
    "adam_optimizer {\n"
    "learning_rate {\n"
    "constant_learning_rate {\n"
    f"learning_rate: {rate}\n"
    "}\n"
    "}\n"
    "}\n"
    "use_moving_average: false\n"
    "}\n"
    for rate in _learning_rates
)

# Print train_config and eval_config in full, then open data_config. The
# data_config section is deliberately left unclosed here: the input_fields
# entries and the closing brace are printed by the statements that follow.
print(
    "train_config {\n"
    + _optimizer_sections
    + "num_steps: 200000\n"
    "sync_replicas: true\n"
    "save_checkpoints_steps: 1100\n"
    "log_step_count_steps: 100\n"
    "save_summary_steps: 100\n"
    "}\n"
    "eval_config {\n"
    "metrics_set {\n"
    "auc {\n"
    "}\n"
    "}\n"
    "eval_online: true\n"
    "eval_interval_secs: 120\n"
    "}\n"
    "data_config {\n"
    "batch_size: 512\n"
    "num_epochs: 1\n"
)
# Declare every input column inside the data_config section opened above, then
# close that section.

# Set membership keeps the per-field check constant-time instead of scanning
# the whole dense feature list once per field.
_dense_feature_names = set(dense_features)
# NOTE(review): dense feature names were lower-cased on load, while the field
# names here are used exactly as written in data_fields_v3.config. A field
# declared with upper-case letters would never match - confirm this is intended.
for name, declared_type in input_fields.items():
    try:
        input_type = input_type_map[declared_type]
    except KeyError:
        # Same exception type as a bare lookup, but it names the culprit.
        raise KeyError(
            f"unsupported type {declared_type!r} for input field {name!r}"
        ) from None
    # Only dense features get an explicit default value of "0".
    default_spec = '\n default_val: "0"' if name in _dense_feature_names else ''
    print(f""" input_fields {{
input_name: "{name}"
input_type: {input_type}{default_spec}
}}""")
# Tail of data_config: label column, prefetch size, reader type, closing brace.
print(""" label_fields: "has_conversion"
prefetch_size: 32
input_type: OdpsInputV2
}
""")
# Bucket edges 0.0, 0.01, ..., 1.0 used for every dense feature. Formatting
# the list inside an f-string yields "[0.0, 0.01, ..., 1.0]": step / 100 is
# the float closest to the two-decimal value, so each element prints in its
# short form.
_boundaries = [step / 100 for step in range(101)]

# Dense features: RawFeature entries with the shared bucket edges.
for name in dense_features:
    print(
        "feature_configs {\n"
        f'input_names: "{name}"\n'
        "feature_type: RawFeature\n"
        f"boundaries: {_boundaries}\n"
        "embedding_dim: 6\n"
        "}"
    )

# Hashed features: sparse IDs first, then tag and sequence columns. The two
# kinds share a layout; TagFeature entries carry an extra separator line.
_hashed_layouts = (
    (sparse_features, "IdFeature", ""),
    (tag_features + seq_features, "TagFeature", "separator: ','\n"),
)
for _names, _feature_type, _extra_line in _hashed_layouts:
    for name in _names:
        print(
            "feature_configs {\n"
            f'input_names: "{name}"\n'
            f"feature_type: {_feature_type}\n"
            "hash_bucket_size: 1000000\n"
            "embedding_dim: 6\n"
            f"{_extra_line}"
            "}"
        )
def wide_and_deep():
    """Print a model_config section for the "WideAndDeep" model class.

    The wide group lists the dense and sparse features; the deep group lists
    those plus the tag and sequence features. Reads the module-level feature
    lists and writes to stdout; returns None.
    """
    wide_names = dense_features + sparse_features
    deep_names = wide_names + tag_features + seq_features

    # Leading "" reproduces the blank line that precedes the section.
    lines = [
        "",
        "model_config {",
        'model_class: "WideAndDeep"',
        "feature_groups: {",
        "group_name: 'wide'",
    ]
    lines.extend(f" feature_names: '{feature}'" for feature in wide_names)
    lines += [
        " wide_deep: WIDE",
        "}",
        "feature_groups: {",
        "group_name: 'deep'",
    ]
    lines.extend(f" feature_names: '{feature}'" for feature in deep_names)
    lines += [
        " wide_deep: DEEP",
        "}",
        "wide_and_deep {",
        "wide_output_dim: 8",
        "dnn {",
        "hidden_units: [256, 128, 64]",
        "}",
        "final_dnn {",
        "hidden_units: [64, 32]",
        "}",
        "l2_regularization: 1e-5",
        "}",
        "embedding_regularization: 1e-6",
        "}",
    ]
    print("\n".join(lines))
def deep_fm():
    """Print a model_config section for the "DeepFM" model class.

    The wide group lists all dense features plus the sparse features. The deep
    group uses the shorter top_dense_features list instead, followed by the
    sparse, tag and sequence features. Reads the module-level feature lists
    and writes to stdout; returns None.
    """
    wide_names = dense_features + sparse_features
    deep_names = top_dense_features + sparse_features + tag_features + seq_features

    def feature_lines(names):
        # One feature_names entry per feature, in list order.
        return [f" feature_names: '{feature}'" for feature in names]

    # Leading "" reproduces the blank line that precedes the section.
    head = [
        "",
        "model_config {",
        'model_class: "DeepFM"',
        "feature_groups: {",
        "group_name: 'wide'",
    ]
    between_groups = [
        " wide_deep: WIDE",
        "}",
        "feature_groups: {",
        "group_name: 'deep'",
    ]
    tail = [
        " wide_deep: DEEP",
        "}",
        "deepfm {",
        "wide_output_dim: 8",
        "dnn {",
        "hidden_units: [256, 128, 64]",
        "}",
        "final_dnn {",
        "hidden_units: [64, 32]",
        "}",
        "l2_regularization: 1e-5",
        "}",
        "embedding_regularization: 1e-6",
        "}",
    ]
    print("\n".join([
        *head,
        *feature_lines(wide_names),
        *between_groups,
        *feature_lines(deep_names),
        *tail,
    ]))
def fm():
    """Print a model_config section for the "FM" model class.

    Both the wide and the deep group list the dense features only, and the
    model-specific fm section is empty. Reads the module-level dense_features
    list and writes to stdout; returns None.
    """
    # The same entries are printed once per group.
    dense_entries = [f" feature_names: '{feature}'" for feature in dense_features]

    # Leading "" reproduces the blank line that precedes the section.
    lines = ["", "model_config {", 'model_class: "FM"']
    for group, marker in (("wide", "WIDE"), ("deep", "DEEP")):
        lines.append("feature_groups: {")
        lines.append(f"group_name: '{group}'")
        lines.extend(dense_entries)
        lines.append(f" wide_deep: {marker}")
        lines.append("}")
    lines += ["fm {", "}", "embedding_regularization: 1e-5", "}"]
    print("\n".join(lines))
def config_export():
    """Print the export_config section, which sets exporter_type to "final".

    The section is preceded by a blank line, and its text ends with a newline
    of its own, so print() leaves a trailing blank line. Returns None.
    """
    section = "\n".join((
        "",
        "export_config {",
        'exporter_type: "final"',
        "}",
        "",
    ))
    print(section)
# Finish the config: the model section first, then the export section.
# deep_fm is the model emitted; wide_and_deep() and fm() are defined above
# but not called.
for emit_section in (deep_fm, config_export):
    emit_section()