SHA1
--- a/inspect_features.py
+++ b/inspect_features.py
@@ -0,0 +1,293 @@
 
				+#! /usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# vim:fenc=utf-8
			
 
				+#
			
 
				+# Copyright © 2025 StrayWarrior <i@straywarrior.com>
			
 
				+
			
 
import hashlib
import os
import pdb
import sys
import time

import numpy as np
import pandas as pd
from eas_prediction import PredictClient
from eas_prediction import StringRequest
from eas_prediction import TFRequest
from odps import ODPS
from sklearn.metrics import roc_auc_score

from q_plot_tool import draw_figures
			
 
				+
			
 
				+ODPS_CONFIG = {
			
 
				+    'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
			
 
				+    'ACCESSID': 'LTAIWYUujJAm7CbH',
			
 
				+    'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
			
 
				+}
			
 
				+
			
 
				+sparse_features = [
			
 
				+    'cid', 'adid', 'adverid',
			
 
				+    'region', 'city', 'brand',
			
 
				+    'vid', 'cate1', 'cate2',
			
 
				+    "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
			
 
				+    "user_vid_return_tags_7d", "user_vid_return_tags_14d",
			
 
				+    "user_cid_click_list", "user_cid_conver_list",
			
 
				+    'apptype' ,'hour' ,'hour_quarter' ,'root_source_scene',
			
 
				+    'root_source_channel' ,'is_first_layer' ,'title_split' ,'profession',
			
 
				+    "user_vid_share_tags_1d", "user_vid_share_tags_14d", "user_vid_return_cate1_14d", "user_vid_return_cate2_14d", "user_vid_share_cate1_14d", "user_vid_share_cate2_14d",
			
 
				+    "creative_type", "user_has_conver_1y",
			
 
				+    "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
			
 
				+    "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
			
 
				+    "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
			
 
				+    "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
			
 
				+    "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
			
 
				+    "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d"
			
 
				+]
			
 
				+
			
 
				+int_features = [
			
 
				+    "user_has_conver_1y",
			
 
				+    "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
			
 
				+    "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
			
 
				+    "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
			
 
				+    "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
			
 
				+    "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
			
 
				+    "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d"
			
 
				+]
			
 
				+
			
 
				+def get_data():
			
 
				+    odps_conf = ODPS_CONFIG
			
 
				+    o = ODPS(odps_conf['ACCESSID'], odps_conf['ACCESSKEY'], 'loghubods',
			
 
				+             endpoint=odps_conf['ENDPOINT'])
			
 
				+    dense_features = open("features_top300.config").readlines()
			
 
				+    dense_features = [name.strip().lower() for name in dense_features]
			
 
				+    feature_names = ','.join(dense_features + sparse_features)
			
 
				+
			
 
				+    partitions = "dt in ('20250620')"
			
 
				+    sql = f''' SELECT {feature_names},has_conversion
			
 
				+           FROM loghubods.ad_easyrec_train_realtime_data_v3_sampled_temp
			
 
				+           WHERE {partitions} AND adverid = '598'
			
 
				+    '''
			
 
				+           # AND ts BETWEEN unix_timestamp('2025-05-14 17:40:00') AND unix_timestamp('2025-05-14 18:00:00')
			
 
				+    data_query_hash = hashlib.sha1(sql.encode("utf-8")).hexdigest()[0:8]
			
 
				+    cache_path = f'ad_data_cache_{data_query_hash}.parquet'
			
 
				+
			
 
				+    try:
			
 
				+        df = pd.read_parquet(cache_path)
			
 
				+    except:
			
 
				+        with o.execute_sql(sql).open_reader() as reader:
			
 
				+            df = reader.to_pandas()
			
 
				+            df.to_parquet(cache_path)
			
 
				+
			
 
				+    def detect_na_return(col):
			
 
				+        if str(df[col].dtype) in ('int64', 'float64') or col in int_features:
			
 
				+            return 0
			
 
				+        elif col in dense_features:
			
 
				+            return 0.0
			
 
				+        elif col in ('has_conversion', 'has_click'):
			
 
				+            return 0
			
 
				+        else:
			
 
				+            return ''
			
 
				+
			
 
				+    def handle_nulls(df):
			
 
				+        # 构建填充字典：数值列填0，非数值列填空字符串
			
 
				+        fill_dict = {
			
 
				+            col: detect_na_return(col) for col in df.columns
			
 
				+        }
			
 
				+        return df.fillna(fill_dict)
			
 
				+
			
 
				+    df = handle_nulls(df)
			
 
				+
			
 
				+    return df
			
 
				+
			
 
				+
			
 
				+ENDPOINT = '1894469520484605.cn-hangzhou.pai-eas.aliyuncs.com'
			
 
				+
			
 
				+TOKEN = 'ODI1MmUxODgzZDc3ODM0ZmQwZWU0YTVjZjdlOWVlMGFlZGJjNTlkYQ=='
			
 
				+SERV_NAME = 'ad_rank_dnn_v11_easyrec'
			
 
				+TOKEN = 'ZmUxOWY5OGYwYmFkZmU0ZGEyM2E4NTFkZjAwNGU0YWNmZTFhYTRhZg=='
			
 
				+SERV_NAME = 'ad_rank_dnn_v11_easyrec_test'
			
 
				+
			
 
				+DTYPE_TO_TF_TYPE = {
			
 
				+    'float64': TFRequest.DT_DOUBLE,
			
 
				+    'object': TFRequest.DT_STRING,
			
 
				+    'int64': TFRequest.DT_INT64
			
 
				+}
			
 
				+
			
 
				+def permutate_feature(df, column):
			
 
				+    df = df.copy()
			
 
				+    np.random.shuffle(df[column].values)
			
 
				+    return df
			
 
				+
			
 
				+def clear_feature(df, column):
			
 
				+    df = df.copy()
			
 
				+    dense_features = open("features_top300.config").readlines()
			
 
				+    dense_features = [name.strip().lower() for name in dense_features]
			
 
				+
			
 
				+    def detect_na_return(col):
			
 
				+        if df[col].dtype == 'int64':
			
 
				+            return 0
			
 
				+        elif df[col].dtype == 'float64':
			
 
				+            return 0.0
			
 
				+        elif col in dense_features:
			
 
				+            return 0.0
			
 
				+        elif col in ('has_conversion', 'has_click'):
			
 
				+            return 0
			
 
				+        else:
			
 
				+            return ''
			
 
				+
			
 
				+    zero_value = detect_na_return(column)
			
 
				+    df[column] = zero_value
			
 
				+    return df
			
 
				+
			
 
				+def build_req(df):
			
 
				+    feature_names = df.columns.tolist()
			
 
				+    batch_size = len(df)
			
 
				+    req = TFRequest('serving_default')
			
 
				+    for name in feature_names:
			
 
				+        dtype = str(df[name].dtype)
			
 
				+        tf_type = DTYPE_TO_TF_TYPE[dtype]
			
 
				+        values = df[name].tolist()
			
 
				+        if dtype == 'object':
			
 
				+            values = [bytes(x, 'utf-8') for x in values]
			
 
				+        req.add_feed(name, [batch_size], tf_type, values)
			
 
				+    req.add_fetch('probs')
			
 
				+    return req
			
 
				+
			
 
				+def predict_by_batches(df, batch_size = 512):
			
 
				+    n_samples = len(df)
			
 
				+    batch_num = (n_samples + batch_size - 1) // batch_size
			
 
				+    scores = []
			
 
				+    for i in range(batch_num):
			
 
				+        sub_df = df[i * batch_size : min(n_samples, (i + 1) * batch_size)]
			
 
				+        req = build_req(sub_df)
			
 
				+        resp = client.predict(req)
			
 
				+        scores.extend([x for x in resp.response.outputs['probs'].float_val])
			
 
				+    return scores
			
 
				+
			
 
				+def permutate_feature_and_predict(df):
			
 
				+    base_scores = client.predict(build_req(df)).response.outputs['probs'].float_val
			
 
				+    base_scores = np.array(base_scores)
			
 
				+    base_scores = base_scores / (base_scores + (1 - base_scores) / 0.04)
			
 
				+    label = df['has_conversion']
			
 
				+    base_auc = roc_auc_score(y_true=label, y_score=base_scores)
			
 
				+    ctcvr = np.sum(label) / len(label)
			
 
				+    print(f'avg base score: {np.average(base_scores):.6f}, auc: {base_auc:.6f}, ctcvr: {ctcvr:.6f}')
			
 
				+
			
 
				+    feature_to_test = df.columns
			
 
				+    feature_to_test = ['profession',]
			
 
				+
			
 
				+    for column in feature_to_test:
			
 
				+        new_df = clear_feature(df, column)
			
 
				+        scores = predict_by_batches(new_df)
			
 
				+        scores = [x / (x + (1 - x) / 0.04) for x in scores]
			
 
				+        scores = np.array(scores)
			
 
				+        avg_score = np.average(scores)
			
 
				+        avg_abs_diff = np.average(np.abs(scores - base_scores))
			
 
				+        avg_diff = np.average(scores - base_scores)
			
 
				+        new_auc = roc_auc_score(y_true=label, y_score=scores)
			
 
				+        auc_diff = new_auc - base_auc
			
 
				+        print(f'{column}\t{avg_score:.6f}\t{avg_diff:.6f}\t{avg_abs_diff:.6f}\t{auc_diff:.6f}')
			
 
				+
			
 
				+
			
 
				+def clear_feature_by_prefix_and_predict(df):
			
 
				+    feature_prefix_list = ["actionstatic","adid","adverid","apptype","b2","b3","b4","b5","b6","b7","b8","brand","cate1","cate2","cid","city","clickall","converall","cpa","creative","ctcvr","ctr","cvr","d1","e1","e2","ecpm","has","hour","incomeall","is","profession","region","root","timediff","title","user","vid","viewall"
			
 
				+]
			
 
				+    base_scores = client.predict(build_req(df)).response.outputs['probs'].float_val
			
 
				+    base_scores = np.array(base_scores)
			
 
				+    base_scores = base_scores / (base_scores + (1 - base_scores) / 0.04)
			
 
				+    label = df['has_conversion']
			
 
				+    try:
			
 
				+        base_auc = roc_auc_score(y_true=label, y_score=base_scores)
			
 
				+    except:
			
 
				+        base_auc = 0
			
 
				+    ctcvr = np.sum(label) / len(label)
			
 
				+    print(f'avg base score: {np.average(base_scores):.6f}, auc: {base_auc:.6f}, ctcvr: {ctcvr:.6f}')
			
 
				+
			
 
				+    for prefix in feature_prefix_list:
			
 
				+        new_df = df
			
 
				+        columns_to_clear = [col for col in df.columns if col.startswith(prefix)]
			
 
				+        for column in columns_to_clear:
			
 
				+            new_df = clear_feature(new_df, column)
			
 
				+        scores = predict_by_batches(new_df)
			
 
				+        scores = [x / (x + (1 - x) / 0.04) for x in scores]
			
 
				+        scores = np.array(scores)
			
 
				+        avg_score = np.average(scores)
			
 
				+        avg_abs_diff = np.average(np.abs(scores - base_scores))
			
 
				+        avg_diff = np.average(scores - base_scores)
			
 
				+        try:
			
 
				+            new_auc = roc_auc_score(y_true=label, y_score=scores)
			
 
				+        except:
			
 
				+            new_auc = 0
			
 
				+        auc_diff = new_auc - base_auc
			
 
				+        print(f'{prefix}\t{avg_score:.6f}\t{avg_diff:.6f}\t{avg_abs_diff:.6f}\t{auc_diff:.6f}')
			
 
				+
			
 
				+def clear_feature_and_predict(df):
			
 
				+    base_scores = predict_by_batches(df)
			
 
				+    base_scores = np.array(base_scores)
			
 
				+    # base_scores = base_scores / (base_scores + (1 - base_scores) / 0.04)
			
 
				+    # print(base_scores)
			
 
				+    label = df['has_conversion']
			
 
				+    ctcvr = np.sum(label) / len(label)
			
 
				+    try:
			
 
				+        base_auc = roc_auc_score(y_true=label, y_score=base_scores)
			
 
				+    except:
			
 
				+        base_auc = 0
			
 
				+    print(f'avg base score: {np.average(base_scores):.6f}, auc: {base_auc:.6f}, ctcvr: {ctcvr:.6f}')
			
 
				+
			
 
				+    feature_to_test = [x.lower().strip() for x in open('features_top50.config').readlines()]
			
 
				+    return
			
 
				+    # feature_to_test = sparse_features
			
 
				+
			
 
				+    all_clean_df = df.copy()
			
 
				+    for column in feature_to_test:
			
 
				+        all_clean_df = clear_feature(all_clean_df, column)
			
 
				+    # score = client.predict(build_req(all_clean_df)).response.outputs['probs'].float_val
			
 
				+    score = predict_by_batches(all_clean_df)
			
 
				+    score = np.array(score)
			
 
				+    score = score / (score + (1 - score) / 0.04)
			
 
				+
			
 
				+    for column in feature_to_test:
			
 
				+        new_df = clear_feature(df, column)
			
 
				+        scores = predict_by_batches(new_df)
			
 
				+        scores = [x / (x + (1 - x) / 0.04) for x in scores]
			
 
				+        scores = np.array(scores)
			
 
				+        avg_score = np.average(scores)
			
 
				+        avg_abs_diff = np.average(np.abs(scores - base_scores))
			
 
				+        avg_diff = np.average(scores - base_scores)
			
 
				+        try:
			
 
				+            new_auc = roc_auc_score(y_true=label, y_score=scores)
			
 
				+        except:
			
 
				+            new_auc = 0
			
 
				+        auc_diff = new_auc - base_auc
			
 
				+        print(f'{column:20}\t{avg_score:.6f}\t{avg_diff:.6f}\t{avg_abs_diff:.6f}\t{auc_diff:.6f}')
			
 
				+        # df_to_draw = pd.DataFrame({
			
 
				+        #     'score': scores,
			
 
				+        #     'label': label
			
 
				+        # })
			
 
				+        # draw_figures(df_to_draw, column, 0.04,
			
 
				+        #              filename=f'plots/feature_q_plot_{column}.png')
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    client = PredictClient(ENDPOINT, SERV_NAME)
			
 
				+    client.set_token(TOKEN)
			
 
				+    client.init()
			
 
				+
			
 
				+    df = get_data()
			
 
				+    # df = df.query('user_vid_return_tags_3d.str.len() > 1')
			
 
				+    # df['user_vid_return_tags_3d'] = ''
			
 
				+    # pd.set_option('display.max_rows', None)
			
 
				+    df['vid'] = df['vid'].apply(lambda x: int(x))
			
 
				+    df['cid'] = df['cid'].apply(lambda x: int(x))
			
 
				+    df['adid'] = df['adid'].apply(lambda x: int(x))
			
 
				+    df['adverid'] = df['adverid'].apply(lambda x: int(x))
			
 
				+    for feature in int_features:
			
 
				+        df[feature] = df[feature].apply(lambda x: int(x))
			
 
				+    if len(df) == 0:
			
 
				+        print("empty df")
			
 
				+        sys.exit(0)
			
 
				+    print(f'df size: {len(df)}')
			
 
				+
			
 
				+    # print(df)
			
 
				+    # print(df[['vid', 'cid', 'adid', 'adverid', 'apptype', 'hour', 'hour_quarter', 'is_first_layer']])
			
 
				+    # clear_feature_and_predict(df)
			
 
				+    # permutate_feature_and_predict(df)
			
 
				+    clear_feature_by_prefix_and_predict(df)
			
 
				+
			
--- a/q_plot_tool.py
+++ b/q_plot_tool.py
@@ -0,0 +1,41 @@
 
				+#! /usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# vim:fenc=utf-8
			
 
				+#
			
 
				+# Copyright © 2025 StrayWarrior <i@straywarrior.com>
			
 
				+
			
 
				+import matplotlib.pyplot as plt
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
# Render sans-serif text with the 'Songti SC' font -- presumably so that CJK
# characters in plot titles display correctly; the font must be installed on
# the machine running this script (TODO confirm availability off macOS).
plt.rcParams['font.sans-serif'] = ['Songti SC']
			
 
				+
			
 
				+def draw_figures(df, plot_name, sample_rate=1, filename=None):
			
 
				+    num_bins = 20
			
 
				+    df['p_bin'], _ = pd.qcut(df['score'], q=num_bins, duplicates='drop', retbins=True)
			
 
				+    quantile_data = df.groupby('p_bin').agg(
			
 
				+        mean_p=('score', 'mean'),
			
 
				+        mean_y=('label', 'mean')
			
 
				+    ).reset_index()
			
 
				+    
			
 
				+    ctr = quantile_data['mean_y']
			
 
				+    actual_quantiles = ctr / (ctr + (1 - ctr) / sample_rate)
			
 
				+
			
 
				+    pctr = quantile_data['mean_p']
			
 
				+    predicted_quantiles = pctr / (pctr + (1 - pctr) / sample_rate)
			
 
				+
			
 
				+    plt.figure(figsize=(6, 6))
			
 
				+    plt.plot(predicted_quantiles, actual_quantiles, ms=3, ls='-', color='blue', label='old')
			
 
				+    plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Ideal Line')
			
 
				+    axis_max = max(predicted_quantiles.tolist()[-1],
			
 
				+                   actual_quantiles.tolist()[-1])
			
 
				+    plt.xlim(0, axis_max)
			
 
				+    plt.ylim(0, axis_max)
			
 
				+    plt.xlabel('Predicted pCTR')
			
 
				+    plt.ylabel('Actual CTR')
			
 
				+    plt.title('Q-Q Plot for pCTR Calibration %s' % (plot_name))
			
 
				+    plt.grid(True)
			
 
				+    if filename:
			
 
				+        plt.savefig(filename)
			
 
				+    else:
			
 
				+        plt.show()
			
--- a/widedeep_v13_1.py
+++ b/widedeep_v13_1.py
@@ -0,0 +1,262 @@
 
				+#! /usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+# vim:fenc=utf-8
			
 
				+#
			
 
				+# Copyright © 2025 StrayWarrior <i@straywarrior.com>
			
 
				+#
			
 
				+# Distributed under terms of the MIT license.
			
 
				+
			
 
"""
1. Remove the viewall features, which tend to introduce bias.
2. Remove the cpa features, whose buckets are unevenly distributed.
3. Reduce the number of dense features.
4. Add user-item (U-I) cross statistics.
5. Add dense features to the linear part.
"""
			
 
				+
			
 
				+
			
 
				+raw_input = open("data_fields_v3.config").readlines()
			
 
				+input_fields = dict(
			
 
				+    map(lambda x: (x[0], x[1]),
			
 
				+        map(lambda x: x.strip().split(' '), raw_input)))
			
 
				+
			
 
				+def read_features(filename, excludes=None):
			
 
				+    features = open(filename).readlines()
			
 
				+    features = [name.strip().lower() for name in features]
			
 
				+    if excludes:
			
 
				+        for x in excludes:
			
 
				+            if x in features:
			
 
				+                features.remove(x)
			
 
				+    return features
			
 
				+
			
 
				+exclude_features = ['viewall', 'cpa']
			
 
				+
			
 
				+dense_features = read_features("features_top300.config", exclude_features)
			
 
				+top_dense_features = read_features('features_top50.config', exclude_features)
			
 
				+
			
 
				+sparse_features = [
			
 
				+    "cid", "adid", "adverid",
			
 
				+    "region", "city", "brand",
			
 
				+    "vid", "cate1", "cate2",
			
 
				+    "apptype", "hour", "hour_quarter", "root_source_scene", "root_source_channel", "is_first_layer", "title_split", "user_has_conver_1y",
			
 
				+    "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
			
 
				+    "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
			
 
				+    "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
			
 
				+    "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
			
 
				+    "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
			
 
				+    "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d"
			
 
				+]
			
 
				+tag_features = [
			
 
				+    "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d",
			
 
				+    "user_vid_return_tags_7d", "user_vid_return_tags_14d"
			
 
				+]
			
 
				+seq_features = [
			
 
				+    "user_cid_click_list", "user_cid_conver_list"
			
 
				+]
			
 
				+
			
 
				+input_type_map = {
			
 
				+    'BIGINT': 'INT64',
			
 
				+    'DOUBLE': 'DOUBLE',
			
 
				+    'STRING': 'STRING'
			
 
				+}
			
 
				+
			
 
# Emit the static head of the generated config on stdout: the train_config
# and eval_config sections, then the opening of data_config. The data_config
# block is intentionally left open here; the input_fields entries and the
# closing brace are printed by the statements that follow.
print("""train_config {
  optimizer_config {
    adam_optimizer {
      learning_rate {
        constant_learning_rate {
          learning_rate: 0.0010
        }
      }
    }
    use_moving_average: false
  }
  optimizer_config {
    adam_optimizer {
      learning_rate {
        constant_learning_rate {
          learning_rate: 0.0006
        }
      }
    }
    use_moving_average: false
  }
  optimizer_config {
    adam_optimizer {
      learning_rate {
        constant_learning_rate {
          learning_rate: 0.002
        }
      }
    }
    use_moving_average: false
  }
  num_steps: 200000
  sync_replicas: true
  save_checkpoints_steps: 1100
  log_step_count_steps: 100
  save_summary_steps: 100
}
eval_config {
  metrics_set {
    auc {
    }
  }
  eval_online: true
  eval_interval_secs: 120
}
data_config {
  batch_size: 512
  num_epochs: 1
""")
			
 
				+
			
 
				+for name in input_fields:
			
 
				+    input_type = input_type_map[input_fields[name]]
			
 
				+    default_spec = ''
			
 
				+    if name in dense_features:
			
 
				+        default_spec = '\n    default_val: "0"'
			
 
				+    print(f"""  input_fields {{
			
 
				+    input_name: "{name}"
			
 
				+    input_type: {input_type}{default_spec}
			
 
				+  }}""")
			
 
				+    # default_val: "0"
			
 
				+
			
 
				+print("""  label_fields: "has_conversion"
			
 
				+  prefetch_size: 32
			
 
				+  input_type: OdpsInputV2
			
 
				+}
			
 
				+""")
			
 
				+
			
 
				+for name in dense_features:
			
 
				+    print(f"""feature_configs {{
			
 
				+  input_names: "{name}"
			
 
				+  feature_type: RawFeature
			
 
				+  boundaries: [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
			
 
				+  embedding_dim: 6
			
 
				+}}""")
			
 
				+
			
 
				+for name in sparse_features:
			
 
				+    print(f"""feature_configs {{
			
 
				+  input_names: "{name}"
			
 
				+  feature_type: IdFeature
			
 
				+  hash_bucket_size: 1000000
			
 
				+  embedding_dim: 6
			
 
				+}}""")
			
 
				+
			
 
				+for name in tag_features + seq_features:
			
 
				+    print(f"""feature_configs {{
			
 
				+  input_names: "{name}"
			
 
				+  feature_type: TagFeature
			
 
				+  hash_bucket_size: 1000000
			
 
				+  embedding_dim: 6
			
 
				+  separator: ','
			
 
				+}}""")
			
 
				+
			
 
				+
			
 
				+def wide_and_deep():
			
 
				+    print("""
			
 
				+model_config {
			
 
				+  model_class: "WideAndDeep"
			
 
				+   feature_groups: {
			
 
				+    group_name: 'wide'""")
			
 
				+
			
 
				+    for name in dense_features + sparse_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: WIDE
			
 
				+  }
			
 
				+  feature_groups: {
			
 
				+    group_name: 'deep'""")
			
 
				+
			
 
				+    for name in dense_features + sparse_features + tag_features + seq_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: DEEP
			
 
				+  }
			
 
				+  wide_and_deep {
			
 
				+    wide_output_dim: 8
			
 
				+
			
 
				+    dnn {
			
 
				+      hidden_units: [256, 128, 64]
			
 
				+    }
			
 
				+    final_dnn {
			
 
				+      hidden_units: [64, 32]
			
 
				+    }
			
 
				+    l2_regularization: 1e-5
			
 
				+  }
			
 
				+  embedding_regularization: 1e-6
			
 
				+}""")
			
 
				+
			
 
				+
			
 
				+def deep_fm():
			
 
				+    print("""
			
 
				+model_config {
			
 
				+  model_class: "DeepFM"
			
 
				+   feature_groups: {
			
 
				+    group_name: 'wide'""")
			
 
				+
			
 
				+    for name in dense_features + sparse_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: WIDE
			
 
				+  }
			
 
				+  feature_groups: {
			
 
				+    group_name: 'deep'""")
			
 
				+
			
 
				+    for name in top_dense_features + sparse_features + tag_features + seq_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: DEEP
			
 
				+  }
			
 
				+  deepfm {
			
 
				+    wide_output_dim: 8
			
 
				+
			
 
				+    dnn {
			
 
				+      hidden_units: [256, 128, 64]
			
 
				+    }
			
 
				+
			
 
				+    final_dnn {
			
 
				+      hidden_units: [64, 32]
			
 
				+    }
			
 
				+    l2_regularization: 1e-5
			
 
				+  }
			
 
				+  embedding_regularization: 1e-6
			
 
				+}""")
			
 
				+
			
 
				+
			
 
				+def fm():
			
 
				+    print("""
			
 
				+model_config {
			
 
				+  model_class: "FM"
			
 
				+   feature_groups: {
			
 
				+    group_name: 'wide'""")
			
 
				+
			
 
				+    for name in dense_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: WIDE
			
 
				+  }
			
 
				+  feature_groups: {
			
 
				+    group_name: 'deep'""")
			
 
				+
			
 
				+    for name in dense_features:
			
 
				+        print(f"""    feature_names: '{name}'""")
			
 
				+
			
 
				+    print("""    wide_deep: DEEP
			
 
				+  }
			
 
				+  fm {
			
 
				+  }
			
 
				+  embedding_regularization: 1e-5
			
 
				+}""")
			
 
				+
			
 
				+
			
 
				+def config_export():
			
 
				+    print("""
			
 
				+export_config {
			
 
				+  exporter_type: "final"
			
 
				+}
			
 
				+""")
			
 
				+
			
 
				+
			
 
# Finish the generated config: the DeepFM model section followed by the export
# section. wide_and_deep() and fm() are defined above but are not invoked.
deep_fm()
config_export()
Автор	SHA1 Опис	Дата
StrayWarrior	aab1a0dc09 Add widedeep_v13_1	2 тижнів тому
StrayWarrior	a5c70abc8a Add some debug tools	2 тижнів тому