|
@@ -0,0 +1,254 @@
|
|
|
|
+#! /usr/bin/env python
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
+# vim:fenc=utf-8
|
|
|
|
+#
|
|
|
|
+# Copyright © 2025 StrayWarrior <i@straywarrior.com>
|
|
|
|
+#
|
|
|
|
+# Distributed under terms of the MIT license.
|
|
|
|
+
|
|
|
|
+"""
|
|
|
|
Remove the viewall feature, which is prone to introducing bias.
|
|
|
|
+"""
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Load the field schema. Each non-blank line of data_fields_v3.config is
# expected to hold a field name followed by its declared type, separated by
# whitespace. The context manager guarantees the file handle is closed.
with open("data_fields_v3.config") as config_file:
    raw_input = config_file.readlines()

# Map field name -> declared type using the first two tokens of each line.
# Blank lines are skipped and any run of whitespace counts as one separator,
# so a trailing empty line or a doubled space no longer breaks the parse.
input_fields = {
    tokens[0]: tokens[1]
    for tokens in (line.split() for line in raw_input)
    if tokens
}
|
|
|
|
+
|
|
|
|
def read_features(filename, excludes=None):
    """Load feature names from a text file holding one name per line.

    Every line is stripped of surrounding whitespace and lower-cased. Blank
    lines are ignored, and every occurrence of a name listed in ``excludes``
    is dropped. The remaining names keep their order from the file.

    Args:
        filename: Path of the feature list file.
        excludes: Optional iterable of feature names to drop. Names are
            compared as given against the lower-cased names from the file.

    Returns:
        A list of the remaining feature names.
    """
    excluded = set(excludes) if excludes else set()
    # The context manager closes the file even if reading fails.
    with open(filename) as feature_file:
        names = [line.strip().lower() for line in feature_file]
    # Filtering (rather than list.remove) drops *all* duplicates of an
    # excluded name; the truthiness check discards blank lines.
    return [name for name in names if name and name not in excluded]
|
|
|
|
+
|
|
|
|
# Feature names removed from every dense feature list loaded below.
exclude_features = [
    "viewall",
    "e1_tags_14d_maxscore",
    "e2_tags_14d_avgscore",
    "e2_tags_14d_maxscore",
    "e1_tags_14d_avgscore",
    "e2_tags_7d_maxscore",
    "e2_tags_7d_avgscore",
    "e2_tags_3d_avgscore",
    "e1_tags_3d_maxscore",
    "e1_tags_7d_maxscore",
    "e2_tags_3d_maxscore",
    "e1_tags_3d_avgscore",
    "e1_tags_7d_avgscore",
]

# Dense feature names from the two list files, minus the exclusions above.
dense_features = read_features(
    "features_top300.config", excludes=exclude_features)
top_dense_features = read_features(
    "features_top100.config", excludes=exclude_features)
|
|
|
|
+
|
|
|
|
# Features emitted as IdFeature entries further down the script.
sparse_features = [
    "cid",
    "adid",
    "adverid",
    "region",
    "city",
    "brand",
    "vid",
    "cate1",
    "cate2",
    "apptype",
    "hour",
    "hour_quarter",
    "root_source_scene",
    "root_source_channel",
    "is_first_layer",
    "title_split",
    "profession",
]

# Tag features: one per look-back window suffix.
tag_features = [
    f"user_vid_return_tags_{window}"
    for window in ("2h", "1d", "3d", "7d", "14d")
]

# Sequence features: click and conversion lists.
seq_features = [
    f"user_cid_{action}_list"
    for action in ("click", "conver")
]
|
|
|
|
+
|
|
|
|
# Translates the type token stored for each field in input_fields into the
# value written after "input_type:" in the generated input_fields entries.
input_type_map = dict(
    BIGINT="INT64",
    DOUBLE="DOUBLE",
    STRING="STRING",
)
|
|
|
|
+
|
|
|
|
# Emit the fixed head of the generated config: train_config (three Adam
# optimizer sections that differ only in their constant learning rate),
# eval_config, and the opening lines of data_config, which is left open here.
print("train_config {")
# Rates are kept as strings so the text is emitted verbatim (e.g. "0.0010").
for rate in ("0.0010", "0.0006", "0.002"):
    print(
        " optimizer_config {\n"
        " adam_optimizer {\n"
        " learning_rate {\n"
        " constant_learning_rate {\n"
        f" learning_rate: {rate}\n"
        " }\n"
        " }\n"
        " }\n"
        " use_moving_average: false\n"
        " }"
    )
print(
    " num_steps: 200000\n"
    " sync_replicas: true\n"
    " save_checkpoints_steps: 1100\n"
    " log_step_count_steps: 100\n"
    " save_summary_steps: 100\n"
    "}\n"
    "eval_config {\n"
    " metrics_set {\n"
    " auc {\n"
    " }\n"
    " }\n"
    " eval_online: true\n"
    " eval_interval_secs: 120\n"
    "}\n"
    "data_config {\n"
    " batch_size: 512\n"
    " num_epochs: 1\n"
)
|
|
|
|
+
|
|
|
|
# Emit one input_fields entry per field read from the schema file. Fields
# that are also dense features additionally get a default value of "0".
for name, column_type in input_fields.items():
    entry = [
        " input_fields {",
        f' input_name: "{name}"',
        f" input_type: {input_type_map[column_type]}",
    ]
    if name in dense_features:
        entry.append(' default_val: "0"')
    entry.append(" }")
    print("\n".join(entry))

# Remaining data_config settings and the brace that closes the section.
print(
    ' label_fields: "has_conversion"\n'
    " prefetch_size: 32\n"
    " input_type: OdpsInputV2\n"
    "}\n"
)
|
|
|
|
+
|
|
|
|
# Bucket edges 0.0, 0.01, ..., 1.0 (101 values). str() of the list renders as
# "[0.0, 0.01, ..., 1.0]", the same text for every dense feature, so it is
# computed once outside the loop.
dense_boundaries = str([step / 100 for step in range(101)])

# One RawFeature entry per dense feature.
for name in dense_features:
    print(
        "feature_configs {\n"
        f' input_names: "{name}"\n'
        " feature_type: RawFeature\n"
        f" boundaries: {dense_boundaries}\n"
        " embedding_dim: 6\n"
        "}"
    )
|
|
|
|
+
|
|
|
|
# One IdFeature entry per sparse feature.
for name in sparse_features:
    print(
        "feature_configs {\n"
        f' input_names: "{name}"\n'
        " feature_type: IdFeature\n"
        " hash_bucket_size: 1000000\n"
        " embedding_dim: 6\n"
        "}"
    )
|
|
|
|
+
|
|
|
|
# Tag and sequence features share one TagFeature entry shape, with values
# separated by commas.
for name in tag_features + seq_features:
    print(
        "feature_configs {\n"
        f' input_names: "{name}"\n'
        " feature_type: TagFeature\n"
        " hash_bucket_size: 1000000\n"
        " embedding_dim: 6\n"
        " separator: ','\n"
        "}"
    )
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def wide_and_deep():
    """Print the model_config section for the WideAndDeep model class.

    The 'wide' group lists the dense and sparse features; the 'deep' group
    lists the dense, sparse, tag and sequence features, in that order.
    """
    wide_group = dense_features + sparse_features
    deep_group = dense_features + sparse_features + tag_features + seq_features

    lines = [
        "",
        "model_config {",
        ' model_class: "WideAndDeep"',
        " feature_groups: {",
        " group_name: 'wide'",
    ]
    lines.extend(f" feature_names: '{feature}'" for feature in wide_group)
    lines.extend([
        " wide_deep: WIDE",
        " }",
        " feature_groups: {",
        " group_name: 'deep'",
    ])
    lines.extend(f" feature_names: '{feature}'" for feature in deep_group)
    lines.extend([
        " wide_deep: DEEP",
        " }",
        " wide_and_deep {",
        " wide_output_dim: 8",
        "",
        " dnn {",
        " hidden_units: [256, 128, 64]",
        " }",
        " final_dnn {",
        " hidden_units: [64, 32]",
        " }",
        " l2_regularization: 1e-5",
        " }",
        " embedding_regularization: 1e-6",
        "}",
    ])
    print("\n".join(lines))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def deep_fm():
    """Print the model_config section for the DeepFM model class.

    The 'wide' group lists the dense and sparse features; the 'deep' group
    lists the top dense features followed by the sparse, tag and sequence
    features.
    """
    wide_group = dense_features + sparse_features
    deep_group = (
        top_dense_features + sparse_features + tag_features + seq_features
    )

    lines = [
        "",
        "model_config {",
        ' model_class: "DeepFM"',
        " feature_groups: {",
        " group_name: 'wide'",
    ]
    lines.extend(f" feature_names: '{feature}'" for feature in wide_group)
    lines.extend([
        " wide_deep: WIDE",
        " }",
        " feature_groups: {",
        " group_name: 'deep'",
    ])
    lines.extend(f" feature_names: '{feature}'" for feature in deep_group)
    lines.extend([
        " wide_deep: DEEP",
        " }",
        " deepfm {",
        " wide_output_dim: 8",
        "",
        " dnn {",
        " hidden_units: [256, 128, 64]",
        " }",
        "",
        " final_dnn {",
        " hidden_units: [64, 32]",
        " }",
        " l2_regularization: 1e-5",
        " }",
        " embedding_regularization: 1e-6",
        "}",
    ])
    print("\n".join(lines))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def fm():
    """Print the model_config section for the FM model class.

    Both the 'wide' and the 'deep' feature groups list the dense features.
    """
    feature_lines = [
        f" feature_names: '{feature}'" for feature in dense_features
    ]

    lines = [
        "",
        "model_config {",
        ' model_class: "FM"',
        " feature_groups: {",
        " group_name: 'wide'",
    ]
    lines.extend(feature_lines)
    lines.extend([
        " wide_deep: WIDE",
        " }",
        " feature_groups: {",
        " group_name: 'deep'",
    ])
    lines.extend(feature_lines)
    lines.extend([
        " wide_deep: DEEP",
        " }",
        " fm {",
        " }",
        " embedding_regularization: 1e-5",
        "}",
    ])
    print("\n".join(lines))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def config_export():
    """Print the export_config section, with exporter_type set to "final"."""
    lines = [
        "",
        "export_config {",
        ' exporter_type: "final"',
        "}",
        "",
    ]
    print("\n".join(lines))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Emit the model section using the DeepFM variant, then the export section.
deep_fm()
config_export()
|