
feat: add automated supply monitoring

zhaohaipeng 1 month ago
Commit
ee9f1686e6

+ 0 - 65
XGB/check_data.py

@@ -1,65 +0,0 @@
-# import pandas as pd
-#
-# old_date_train = f"/Users/zhao/Desktop/Code/Python/model_monitor/XGB/data/all/20241012_predict.csv"
-# new_date_train = f"/Users/zhao/Desktop/Code/Python/model_monitor/XGB/20241012_predict_1.csv"
-#
-# # 读取两个 CSV 文件
-# old_df = pd.read_csv(old_date_train)
-# new_df = pd.read_csv(new_date_train)
-#
-# if old_df.shape[0] != new_df.shape[0]:
-#     print(f"新老训练数据集长度不一样 新数据集: {new_df.shape[0]}, 老数据集: {old_df.shape[0]}")
-#
-# old_df_col = old_df.columns
-# new_df_col = new_df.columns
-# if len(old_df_col) != len(new_df_col):
-#     print(f"两个文件列数不一样 新文件: {new_df_col}, 老文件: {old_df_col}")
-#
-# for col in old_df_col:
-#     if col not in new_df_col:
-#         print(f"列 {col} 在老文件存在,新文件不存在")
-#
-# for col in new_df_col:
-#     if col not in old_df_col:
-#         print(f"列 {col} 在新文件存在,老文件不存在")
-#
-# old_df.set_index("vid", inplace=True)
-# new_df.set_index("vid", inplace=True)
-#
-# old_dict = old_df.to_dict(orient="index")
-# new_dict = new_df.to_dict(orient="index")
-#
-# for e in new_dict:
-#     if e not in old_dict:
-#         print(f"vid {e} 在新文件中存在,在老文件中不存在")
-#     new_row = new_dict[e]
-#     old_row = old_dict[e]
-#     for col in new_df_col:
-#         if col in ['vid', '曝光占比', '分子', '分母', 'label']:
-#             continue
-#         if col not in old_row:
-#             print(f"vid {e} 的列 {col} 在老文件中不存在")
-#             continue
-#         # if col in new_row:
-#         #     print(f"vid {e} 的列 {col} 在新文件中不存在")
-#         #     continue
-#         if old_row[col] != new_row[col]:
-#             print(f"vid {e} 列 {col} 的值在新老文件不一样, 新文件的值: {new_row[col]}, 老文件的值: {old_row[col]}")
-#
-# # z_vid = set()
-# # with open("/Users/zhao/Desktop/Code/Python/rov-offline/write_redis/filtered_vid", "r") as f:
-# #     for line in f:
-# #         z_vid.add(line.replace("\n", ""))
-# #
-# # p_vid = set()
-# # with open("./filtered_vid.txt", "r") as f:
-# #     for line in f:
-# #         p_vid.add(line.replace("\n", ""))
-# #
-# # for e in z_vid:
-# #     if e not in p_vid:
-# #         print(f"VID: {e} 离线预测有,在线预测没有")
-# #
-# # for e in p_vid:
-# #     if e not in z_vid:
-# #         print(f"VID: {e} 在线预测有,离线预测没有")

+ 0 - 1
XGB/file/readme.txt

@@ -1 +0,0 @@
-用于存放XGB模型的一些过程文件

+ 0 - 446
XGB/vov_xgboost_train.py

@@ -1,446 +0,0 @@
-import concurrent.futures
-import json
-import logging
-from datetime import datetime, timedelta
-
-import numpy as np
-import pandas as pd
-import xgboost as xgb
-
-from client import ODPSClient
-from config import ConfigManager
-from util import feishu_inform_util
-
-odps_client = ODPSClient.ODPSClient()
-config_manager = ConfigManager.ConfigManager()
-
-features_name = ['1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01', '3_vov01', '4_vov01', '5_vov01',
-                 '3_vov012', '4_vov012', '5_vov012', "12_change", "23_change", "34_change", '2_vov01', '3_vov01',
-                 '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012']
-
-column_names = ['曝光占比', 'vov0', '分子', '分母', '1_vov0', '2_vov0', '3_vov0', '4_vov0', '5_vov0', '2_vov01',
-                '3_vov01', '4_vov01', '5_vov01', '3_vov012', '4_vov012', '5_vov012', '1_vov0_分子', '1_vov0_分母',
-                '2_vov0_分子', '2_vov0_分母', '3_vov0_分子', '3_vov0_分母', '4_vov0_分子', '4_vov0_分母',
-                '5_vov0_分子', '5_vov0_分母', '2_vov01_分子', '2_vov01_分母', '3_vov01_分子', '3_vov01_分母',
-                '4_vov01_分子', '4_vov01_分母', '5_vov01_分子', '5_vov01_分母', '3_vov012_分子', '3_vov012_分母',
-                '4_vov012_分子', '4_vov012_分母', '5_vov012_分子', '5_vov012_分母']
-
-# 创建一个logger
-logger = logging.getLogger("vov_xgboost_train.py")
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-
-def get_partition_df(table, dt):
-    logger.info(f"开始下载: {table} -- {dt} 的数据")
-
-    download_session = odps_client.get_download_session(table, dt)
-    logger.info(f"表: {table} 中的分区 {dt}, 共有 {download_session.count} 条数据")
-
-    with download_session.open_arrow_reader(0, download_session.count) as reader:
-        # 将所有数据加载到 DataFrame 中
-        df = pd.concat([batch.to_pandas() for batch in reader])
-
-    logger.info(f"下载结束: {table} -- {dt} 的数据, 共计 {df.shape[0]} 条数据")
-    return df
-
-
-def fetch_label_data(label_datetime: datetime):
-    """
-    获取 label数据
-    :return:
-    """
-    label_dt = label_datetime.strftime("%Y%m%d")
-    logger.info(f"fetch_label_data.dt: {label_dt}")
-
-    # 获取数据
-    label_df = get_partition_df("alg_vid_vov_new", label_dt)
-    extracted_data = [
-        {
-            'vid': int(row['vid']),
-        }
-        for _, row in label_df.iterrows()
-    ]
-    # 构造新的 DataFrame
-    applied_df = pd.DataFrame(extracted_data)
-    # 添加 title 列
-    applied_df['title'] = "title"
-
-    return applied_df
-
-
-def fetch_view_rate_data(view_date: datetime):
-    """
-    获取曝光数据
-    :return:
-    """
-    view_rate_dt = view_date.strftime("%Y%m%d")
-    logger.info(f"fetch_view_rate_date.dt: {view_rate_dt}")
-    try:
-        # 获取数据
-        view_rate_df = get_partition_df("alg_vid_vov_new", view_rate_dt)
-        extracted_data = [
-            {
-                'vid': int(row['vid']),
-                '分母': int(feature['1_vov0_分母']),
-                '分子': feature['1_vov0_分子'],
-                'vov0': feature['1_vov0']
-            }
-            for _, row in view_rate_df.iterrows()
-            if (feature := json.loads(row['feature']))
-        ]
-        # 构造新的 DataFrame
-        applied_df = pd.DataFrame(extracted_data)
-        # 计算曝光占比,矢量化操作
-        view_sum = applied_df['分母'].sum()
-        applied_df['曝光占比'] = applied_df['分母'] / view_sum
-
-        return applied_df
-    except Exception as e:
-        return pd.DataFrame({
-            "vid": [-1],
-            "分母": [0],
-            "分子": [0],
-            "vov0": [0],
-            "曝光占比": [0]
-        })
-
-
-def fetch_feature_data_dt(dt: str, index):
-    """
-    查询某一天的特征数据,方便做特征数据时并行处理
-    :param dt:
-    :param index:
-    :return:
-    """
-
-    logger.info(f"fetch_feature_data_dt.dt -- {dt} 的数据")
-
-    df = get_partition_df("videoid_vov_base_data", dt).fillna(0)
-
-    today_dist_view_pv = df['today_dist_view_pv'].astype(int)
-    today_return_to_dist_view_pv = df['today_return_to_dist_view_pv'].astype(int)
-    day1_return_to_dist_view_pv = df['day1_return_to_dist_view_pv'].astype(int)
-    day2_return_to_dist_view_pv = df['day2_return_to_dist_view_pv'].astype(int)
-
-    # all_return_to_dist_view_pv
-    t_1_all_return_to_dist_view_pv = today_return_to_dist_view_pv + day1_return_to_dist_view_pv
-    t_2_all_return_to_dist_view_pv = t_1_all_return_to_dist_view_pv + day2_return_to_dist_view_pv
-
-    # all_vov
-    t_0_all_vov = today_return_to_dist_view_pv / today_dist_view_pv.where(today_dist_view_pv > 0, 1)
-    t_0_all_vov = t_0_all_vov.where(today_dist_view_pv > 0, 0)
-
-    t_1_all_vov = t_1_all_return_to_dist_view_pv / today_dist_view_pv.where(today_dist_view_pv > 0, 1)
-    t_1_all_vov = t_1_all_vov.where(today_dist_view_pv > 0, 0)
-
-    t_2_all_vov = t_2_all_return_to_dist_view_pv / today_dist_view_pv.where(today_dist_view_pv > 0, 1)
-    t_2_all_vov = t_2_all_vov.where(today_dist_view_pv > 0, 0)
-
-    # 构造结果DataFrame
-    result_df = pd.DataFrame({
-        'vid': df['videoid'],
-
-        f'{index}_vov0': t_0_all_vov,
-        f'{index}_vov0_分子': today_return_to_dist_view_pv,
-        f'{index}_vov0_分母': today_dist_view_pv,
-
-        f'{index}_vov01': t_1_all_vov,
-        f'{index}_vov01_分子': t_1_all_return_to_dist_view_pv,
-        f'{index}_vov01_分母': today_dist_view_pv,
-
-        f'{index}_vov012': t_2_all_vov,
-        f'{index}_vov012_分子': t_2_all_return_to_dist_view_pv,
-        f'{index}_vov012_分母': today_dist_view_pv,
-    })
-    logger.info(f"完成处理 videoid_vov_base_data -- {dt} 的数据")
-
-    return result_df
-
-
-def fetch_feature_data(t_1_datetime: datetime):
-    """
-    获取feature数据
-    :return:
-    """
-
-    with concurrent.futures.ThreadPoolExecutor(5) as executor:
-        t_1_feature_task = executor.submit(
-            fetch_feature_data_dt, t_1_datetime.strftime("%Y%m%d"), 1
-        )
-        t_2_datetime = t_1_datetime - timedelta(days=1)
-        t_2_feature_task = executor.submit(
-            fetch_feature_data_dt, t_2_datetime.strftime("%Y%m%d"), 2
-        )
-        t_3_datetime = t_1_datetime - timedelta(days=2)
-        t_3_feature_task = executor.submit(
-            fetch_feature_data_dt, t_3_datetime.strftime("%Y%m%d"), 3
-        )
-        t_4_datetime = t_1_datetime - timedelta(days=3)
-        t_4_feature_task = executor.submit(
-            fetch_feature_data_dt, t_4_datetime.strftime("%Y%m%d"), 4
-        )
-        t_5_datetime = t_1_datetime - timedelta(days=4)
-        t_5_feature_task = executor.submit(
-            fetch_feature_data_dt, t_5_datetime.strftime("%Y%m%d"), 5
-        )
-
-        logger.info(
-            f"fetch_feature_data:"
-            f"\t t_1_feature_task.datetime: {t_1_datetime.strftime('%Y%m%d')}"
-            f"\t t_2_feature_task.datetime: {t_2_datetime.strftime('%Y%m%d')}"
-            f"\t t_3_feature_task.datetime: {t_3_datetime.strftime('%Y%m%d')}"
-            f"\t t_4_feature_task.datetime: {t_4_datetime.strftime('%Y%m%d')}"
-            f"\t t_5_feature_task.datetime: {t_5_datetime.strftime('%Y%m%d')}"
-        )
-
-        t_1_feature = t_1_feature_task.result()
-        t_2_feature = t_2_feature_task.result()
-        t_3_feature = t_3_feature_task.result()
-        t_4_feature = t_4_feature_task.result()
-        t_5_feature = t_5_feature_task.result()
-
-        t_1_feature = t_1_feature[['vid', "1_vov0", "1_vov0_分子", "1_vov0_分母"]]
-        t_2_feature = t_2_feature[
-            ['vid', "2_vov0", "2_vov0_分子", "2_vov0_分母", "2_vov01", "2_vov01_分子", "2_vov01_分母"]
-        ]
-
-    return t_1_feature, t_2_feature, t_3_feature, t_4_feature, t_5_feature
-
-
-def fetch_data(label_datetime: datetime, feature_start_datetime: datetime, view_rate_datetime: datetime):
-    with concurrent.futures.ThreadPoolExecutor(3) as executor:
-        label_future = executor.submit(fetch_label_data, label_datetime)
-        feature_future = executor.submit(fetch_feature_data, feature_start_datetime)
-        view_rate_future = executor.submit(fetch_view_rate_data, view_rate_datetime)
-
-        label_apply_df = label_future.result()
-        t_1_feature, t_2_feature, t_3_feature, t_4_feature, t_5_feature = feature_future.result()
-        view_rate = view_rate_future.result()
-
-        df = (pd.merge(label_apply_df, view_rate, on="vid", how='left')
-              .merge(t_1_feature, on="vid", how='left')
-              .merge(t_2_feature, on="vid", how='left')
-              .merge(t_3_feature, on="vid", how='left')
-              .merge(t_4_feature, on="vid", how='left')
-              .merge(t_5_feature, on="vid", how='left')
-              )
-        df.fillna(0, inplace=True)
-        df.sort_values(by=['曝光占比'], ascending=False, inplace=True)
-
-        for col in column_names:
-            df[col] = pd.to_numeric(df[col], errors='coerce')
-
-        df["12_change"] = df["1_vov0"] - df["2_vov0"]
-        df["23_change"] = df["2_vov0"] - df["3_vov0"]
-        df["34_change"] = df["3_vov0"] - df["4_vov0"]
-
-        df["label"] = df["vov0"].apply(lambda x: 1 if x > 0.25 else 0)
-
-    return df
-
-
-def xgb_train_multi_dt_data(t_1_label_dt: datetime):
-    """
-    XGB模型多天训练数据
-    :param t_1_label_dt:
-    :return:
-    """
-    with concurrent.futures.ThreadPoolExecutor(3) as executor:
-        t_1_feature_dt = t_1_label_dt - timedelta(2)
-        logger.info(
-            f"VOV模型特征数据处理 --- t_1_label_future:"
-            f"\t label_datetime: {t_1_label_dt.strftime('%Y%m%d')} "
-            f"\t feature_datetime: {t_1_feature_dt.strftime('%Y%m%d')} "
-            f"\t view_rate_datetime: {t_1_label_dt.strftime('%Y%m%d')} "
-        )
-        t_1_label_future = executor.submit(fetch_data, t_1_label_dt, t_1_feature_dt, t_1_label_dt)
-
-        t_2_label_dt = t_1_label_dt - timedelta(1)
-        t_2_feature_dt = t_2_label_dt - timedelta(1)
-        logger.info(
-            f"VOV模型特征数据处理 --- t_2_label_future:"
-            f"\t label_datetime: {t_2_label_dt.strftime('%Y%m%d')} "
-            f"\t feature_datetime: {t_2_feature_dt.strftime('%Y%m%d')} "
-            f"\t view_rate_datetime: {t_2_label_dt.strftime('%Y%m%d')} "
-        )
-        t_2_label_future = executor.submit(fetch_data, t_2_label_dt, t_2_feature_dt, t_2_label_dt)
-
-        t_3_label_dt = t_1_label_dt - timedelta(2)
-        t_3_feature_dt = t_3_label_dt - timedelta(1)
-        logger.info(
-            f"VOV模型特征数据处理 --- t_3_label_future:"
-            f"\t label_datetime: {t_3_label_dt.strftime('%Y%m%d')} "
-            f"\t feature_datetime: {t_3_feature_dt.strftime('%Y%m%d')} "
-            f"\t view_rate_datetime: {t_3_label_dt.strftime('%Y%m%d')} "
-        )
-        t_3_label_future = executor.submit(fetch_data, t_3_label_dt, t_3_feature_dt, t_3_label_dt)
-
-        t_1_label_df = t_1_label_future.result()
-        t_2_label_df = t_2_label_future.result()
-        t_3_label_df = t_3_label_future.result()
-
-    return pd.concat([t_1_label_df, t_2_label_df, t_3_label_df], ignore_index=True)
-
-
-def xgb_predict_dt_data(label_datetime: datetime):
-    """
-    获取预估数据
-    :param label_datetime:
-    :return:
-    """
-    feature_start_datetime = label_datetime
-    view_rate_datetime = label_datetime + timedelta(2)
-    logger.info(
-        f"VOV模型预测数据处理 --- predict_df: "
-        f"\t label_datetime: {label_datetime.strftime('%Y%m%d')} "
-        f"\t feature_datetime: {feature_start_datetime.strftime('%Y%m%d')} "
-        f"\t view_rate_datetime: {view_rate_datetime.strftime('%Y%m%d')} "
-    )
-    return fetch_data(label_datetime, feature_start_datetime, view_rate_datetime)
-
-
-def _main():
-    logger.info(f"XGB模型训练")
-    train_df = xgb_train_multi_dt_data((datetime.now() - timedelta(days=4)))
-    trains_array = train_df[features_name].values
-    trains_label_array = train_df['label'].values
-
-    logger.info(f"特征获取完成,开始训练。 训练使用的数据量: {train_df.shape[0]}")
-    model = xgb.XGBClassifier(
-        n_estimators=1000,
-        learning_rate=0.01,
-        max_depth=5,
-        min_child_weight=1,
-        gamma=0,
-        subsample=0.8,
-        colsample_bytree=0.8,
-        objective='binary:logistic',
-        nthread=8,
-        scale_pos_weight=1,
-        random_state=2024,
-        seed=2024,
-    )
-    model.fit(trains_array, trains_label_array)
-
-    logger.info("获取评测数据")
-    predict_df = xgb_predict_dt_data((datetime.now() - timedelta(days=3)))
-    tests_array = predict_df[features_name].values
-    y_pred = model.predict_proba(tests_array)[:, 1]
-    predict_df["y_pred"] = y_pred
-    condition_choose = (
-            (predict_df['y_pred'] <= 0.1) &
-            (
-                    (predict_df['2_vov0_分母'] > 50) |
-                    (predict_df['3_vov0_分母'] > 50) |
-                    (predict_df['4_vov0_分母'] > 50)
-            ) &
-            (
-                (predict_df['1_vov0'] - predict_df['2_vov0'] < 0.1)
-            )
-    )
-    profit_threshold = 0.3
-    condition_choose_real = condition_choose & (predict_df['vov0'] <= profit_threshold)
-    predict_df["condition_choose"] = condition_choose
-    predict_df[["vid", "曝光占比", "vov0", "condition_choose"]].to_csv(
-        f"{config_manager.project_home}/XGB/file/new_" + (datetime.now() - timedelta(days=1)).strftime("%Y%m%d"),
-        sep="\t",
-        index=False
-    )
-
-    choose_bad = condition_choose.sum()
-    choose_bad_real_bad = condition_choose_real.sum()
-    acc = choose_bad_real_bad / choose_bad
-    logger.info(
-        f"acc:{acc} "
-        f"分子={choose_bad_real_bad} "
-        f"分母={choose_bad} "
-        f"总视频数={predict_df.shape[0]} "
-        f"盈利计算标注vov0大于:{profit_threshold}"
-    )
-
-    surface = predict_df.loc[condition_choose, '曝光占比'].sum()
-    surface_income = predict_df.loc[condition_choose_real, '曝光占比'].sum()
-    logger.info(
-        f"总影响面:{round(surface, 6)} "
-        f"盈利影响面:{round(surface_income, 6)} "
-        f"亏损影响面:{round(surface - surface_income, 6)}"
-    )
-
-    predict_df["profit_loss_value"] = predict_df['分母'] * (predict_df['vov0'] - profit_threshold)
-    profit_loss_value = predict_df.loc[condition_choose, 'profit_loss_value'].sum()
-    profit_value = predict_df.loc[condition_choose_real, 'profit_loss_value'].sum()
-    logger.info(
-        f"总盈亏:{round(profit_loss_value, 1)} "
-        f"纯盈利:{round(profit_value, 1)} "
-        f"纯亏损:{round(profit_loss_value - profit_value, 1)} "
-        f"盈利效率:{round(profit_loss_value / profit_value, 6)}"
-    )
-
-    filtered_vid = predict_df.loc[condition_choose, 'vid'].unique()
-
-    # 写入本地文件
-    np.savetxt(
-        f"{config_manager.project_home}/XGB/file/filtered_vid_{datetime.now().strftime('%Y%m%d')}.csv",
-        filtered_vid,
-        fmt="%d",
-        delimiter=","
-    )
-
-    # 写入Redis
-    # redis_key = f"redis:lower_vov_vid:{datetime.now().strftime('%Y%m%d')}"
-    #
-    # logger.info(f"当前环境为: {config_manager.get_env()}, 要写入的Redis Key为: {redis_key}")
-    # host, port, password = config_manager.get_algorithm_redis_info()
-    # alg_redis = RedisHelper.RedisHelper(host=host, port=port, password=password)
-    # for vid in filtered_vid.tolist():
-    #     alg_redis.add_number_to_set(redis_key, vid)
-    #
-    # alg_redis.set_expire(redis_key, 86400)
-
-
-if __name__ == '__main__':
-    card_json = {
-        "config": {},
-        "i18n_elements": {
-            "zh_cn": [
-                {
-                    "tag": "markdown",
-                    "content": "",
-                    "text_align": "left",
-                    "text_size": "normal"
-                }
-            ]
-        },
-        "i18n_header": {
-            "zh_cn": {
-                "title": {
-                    "tag": "plain_text",
-                    "content": "XGB模型训练预测完成"
-                },
-                "template": "turquoise"
-            }
-        }
-    }
-
-    try:
-        _main()
-
-        msg_text = f"\n- 所属项目: model_monitor" \
-                   f"\n- 所属环境: {config_manager.get_env()}" \
-                   f"\n- 告警描述: VOV预测模型训练和预测完成, 用于低VOV视频过滤"
-        card_json['i18n_elements']['zh_cn'][0]['content'] = msg_text
-
-    except Exception as e:
-        logger.error("VOV过滤XGB模型训练异常: ", e)
-        msg_text = f"\n- 所属项目: model_monitor" \
-                   f"\n- 所属环境: {config_manager.get_env()}" \
-                   f"\n- 告警描述: VOV预测模型训练和预测失败, 用于低VOV视频过滤"
-        card_json['i18n_header']['zh_cn']['template'] = "red"
-        card_json['i18n_header']['zh_cn']["title"]['content'] = "XGB模型训练预测失败"
-        card_json['i18n_elements']['zh_cn'][0]['content'] = msg_text
-    if config_manager.get_env() == "pro":
-        # 发送通知
-        feishu_inform_util.send_card_msg_to_feishu(
-            webhook=config_manager.get_vov_model_inform_feishu_webhook(),
-            card_json=card_json
-        )

+ 4 - 0
client/SLSClient.py

@@ -0,0 +1,4 @@
+
+class SLSClient(object):
+    def __init__(self):
+        pass
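
The new SLSClient is only an empty placeholder for now. Below is a minimal sketch of what it could grow into, modelled on the aliyun.log usage in monitor/automation_provide_job_monitor.py later in this commit; the constructor signature and the query_today helper are assumptions, not part of this change:

import datetime

from aliyun.log import LogClient
from aliyun.log.auth import AUTH_VERSION_4


class SLSClient(object):
    # Thin wrapper around the Aliyun SLS LogClient (sketch only, parameters assumed).

    def __init__(self, endpoint, access_key_id, access_key, region="cn-hangzhou"):
        self.client = LogClient(endpoint=endpoint, accessKey=access_key, accessKeyId=access_key_id,
                                auth_version=AUTH_VERSION_4, region=region)

    def query_today(self, project, log_store, query_sql):
        # Query the whole current day, mirroring the time-window logic in the monitor script.
        today = datetime.datetime.now().date()
        start = int(datetime.datetime.combine(today, datetime.time.min).timestamp())
        end = int(datetime.datetime.combine(today, datetime.time.max).timestamp())
        resp = self.client.get_log(project=project, logstore=log_store,
                                   from_time=start, to_time=end, query=query_sql)
        return resp.get_body().get("data")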

+ 4 - 0
helper/RedisHelper.py

@@ -33,6 +33,10 @@ class RedisHelper(object):
         logger.info(f"Redis Delete: {key}")
         self.redis_conn.delete(key)
 
+    def batch_delete(self, *keys):
+        logger.info(f"Redis Batch Delete: {keys}")
+        self.redis_conn.delete(*keys)
+
     def m_get_pipeline(self, keys):
         pipeline = self.redis_conn.pipeline()
         for key in keys:
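
A short usage sketch for the new batch_delete helper, which forwards all keys to a single redis delete call. The import paths and key names below are assumptions based on the repo layout and the Redis keys referenced elsewhere in this commit, not part of this change:

from config import ConfigManager
from helper import RedisHelper

config_manager = ConfigManager.ConfigManager()
host, port, password = config_manager.get_algorithm_redis_info()
redis_helper = RedisHelper.RedisHelper(host=host, port=port, password=password)

# Remove several dated key variants in one call instead of looping over delete().
redis_helper.batch_delete("redis:lower_vov_vid:20241201", "redis:lower_vov_vid:20241202")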

+ 0 - 35
model/XGBModel.py

@@ -1,35 +0,0 @@
-import numpy as np
-import xgboost as xgb
-
-
-class XGBModel(object):
-    def __init__(self, model_file, features: list):
-        self.model_file = model_file
-        self.model = xgb.Booster(model_file=model_file)
-        self.features = features
-
-    def predict(self, feature_map: dict) -> float:
-        values = np.array([
-            float(feature_map.get(feature, 0.0))
-            for feature in self.features
-        ])
-
-        dm = xgb.DMatrix(values.reshape(1, -1), missing=0.0)
-        return float(self.model.predict(dm, output_margin=False)[0])
-
-    def feature_weight_importance(self):
-        return self.feature_importance("weight")
-
-    def feature_cover_importance(self):
-        return self.feature_importance("cover")
-
-    def feature_gain_importance(self):
-        return self.feature_importance("gain")
-
-    def feature_importance(self, importance_type: str):
-        importance_map = {}
-        score_map = self.model.get_score(importance_type=importance_type)
-        for key in score_map:
-            k = self.features[int(key[1:])]
-            importance_map[k] = score_map[key]
-        return importance_map

+ 0 - 0
model/__init__.py


+ 0 - 245
model/crowd_choose_offline_check.py

@@ -1,245 +0,0 @@
-# This is a sample Python script.
-
-# Press ⌃R to execute it or replace it with your code.
-# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
-import hashlib
-import json
-import os
-import subprocess
-import time
-import traceback
-import urllib
-import datetime
-import requests
-import pymysql
-from contextlib import contextmanager
-
-from sqlalchemy import create_engine, Numeric, Float
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime, Text
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-HOST = 'rm-bp1nx318263k95yo3318.mysql.rds.aliyuncs.com'
-PORT = '3306'
-DATABASE = 'uservideo_bi'
-USERNAME = 'majin'
-PASSWORD = 'E5d2c960fdf3f5f0be5a27eea2f906ef'
-DB_URI = "mysql+pymysql://{username}:{password}@{host}:{port}/{db}?charset=utf8".format(username=USERNAME,
-                                                                                            password=PASSWORD,
-                                                                                            host=HOST, port=PORT,
-                                                                                            db=DATABASE)
-
-# HOST = 'rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com'
-# PORT = '3306'
-# DATABASE = 'mpad'
-# USERNAME = 'majin'
-# PASSWORD = 'e5d2c960fdf3f5f0be5a27eea2f906ef'
-# DB_URI = "mysql+pymysql://{username}:{password}@{host}:{port}/{db}?charset=utf8".format(username=USERNAME,
-#                                                                                         password=PASSWORD,
-#                                                                                         host=HOST, port=PORT,
-#                                                                                         db=DATABASE)
-
-
-Base = declarative_base()
-
-class WECHAT_AD_PUBLISHER_ADUNIT_GENERAL(Base):
-    __tablename__ = 'wechat_ad_publisher_adunit_general'
-    id = Column(Integer, primary_key=True)
-    ad_unit_id = Column(String(1000))
-    ad_unit_name = Column(String(1000))
-    ad_slot = Column(String(1000))
-    click_count = Column(Integer, default=0)
-    click_rate = Column(Float, default=0.0)
-    date = Column(String(1000))
-    ecpm = Column(String(1000))
-    exposure_count = Column(Integer, default=0)
-    exposure_rate = Column(Float, default=0.0)
-    income = Column(Integer, default=0)
-    req_succ_count = Column(Integer, default=0)
-    app_type = Column(Integer, default=0)
-    slot_str = Column(String(1000))
-    date_str = Column(String(1000))
-
-    def __init__(self):
-        print(f"AUNIT_GENERAL.init app_type = {self.app_type}, ad_unit_id = {self.ad_unit_id}")
-
-    def __repr__(self):
-        return '<WECHAT_AD_PUBLISHER_ADUNIT_GENERAL %r>' % self.ad_unit_id
-
-class WECHAT_AD_PUBLISHER_ADPOS_GENERAL(Base):
-    __tablename__ = 'wechat_ad_publisher_adpos_general'
-    id = Column(Integer, primary_key=True)
-    ad_slot = Column(String(1000))
-    click_count = Column(Integer, default=0)
-    click_rate = Column(Float, default=0.0)
-    date = Column(String(1000))
-    ecpm = Column(String(1000))
-    exposure_count = Column(Integer, default=0)
-    exposure_rate = Column(Float, default=0.0)
-    income = Column(Integer, default=0)
-    req_succ_count = Column(Integer, default=0)
-    app_type = Column(Integer, default=0)
-    slot_str = Column(String(1000))
-    date_str = Column(String(1000))
-
-    def __init__(self):
-        print(f"ADPOS_GENERAL.init app_type = {self.app_type}, ad_slot = {self.ad_slot}")
-
-    def __repr__(self):
-        return '<wechat_ad_publisher_adpos_general %r>' % self.ad_slot
-
-
-@contextmanager
-def session_maker(session=None, session_factory=None):
-    try:
-        if session_factory is None:
-            engine = create_engine(DB_URI)
-            session_factory = sessionmaker(bind=engine)
-        if session is None:
-            session = session_factory()
-        yield session
-    except:
-        session.rollback()
-        raise
-    else:
-        session.commit()
-        # logger.debug('session.commit(){}'.format(session))
-    finally:
-        session.close()
-        # logger.debug('session.close(){}'.format(session))
-
-def add_ad_data(data, app_type):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, add_ad_data.app_type = {app_type}, data = {data}')  # Press ⌘F8 to toggle the breakpoint.
-    stat_item = data['stat_item']
-    try:
-        with session_maker() as session:
-            wechat_ad_publisher_adunit_general = WECHAT_AD_PUBLISHER_ADUNIT_GENERAL()
-            wechat_ad_publisher_adunit_general.ad_unit_id = data['ad_unit_id']
-            wechat_ad_publisher_adunit_general.ad_unit_name = data['ad_unit_name']
-            wechat_ad_publisher_adunit_general.ad_slot = stat_item['ad_slot']
-            wechat_ad_publisher_adunit_general.click_count = stat_item['click_count']
-            wechat_ad_publisher_adunit_general.click_rate = stat_item['click_rate']
-            wechat_ad_publisher_adunit_general.date = stat_item['date']
-            wechat_ad_publisher_adunit_general.ecpm = stat_item['ecpm']
-            wechat_ad_publisher_adunit_general.exposure_count = stat_item['exposure_count']
-            wechat_ad_publisher_adunit_general.exposure_rate = stat_item['exposure_rate']
-            wechat_ad_publisher_adunit_general.income = stat_item['income']
-            wechat_ad_publisher_adunit_general.req_succ_count = stat_item['req_succ_count']
-            wechat_ad_publisher_adunit_general.slot_str = stat_item['slot_str']
-            wechat_ad_publisher_adunit_general.date_str = stat_item['date'].replace('-','')
-            wechat_ad_publisher_adunit_general.app_type = app_type
-            session.add(wechat_ad_publisher_adunit_general)
-            print(f'add_ad_data is OK!; app_type = {app_type}')
-    except Exception as e:
-        traceback.print_exc()
-        print(f"add_ad_data error: app_type = {app_type}; traceback.format_exc = {traceback.format_exc()}")
-
-def add_ad_adpos_data(stat_item, app_type):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, add_ad_adpos_data.app_type = {app_type}, stat_time = {stat_item}')  # Press ⌘F8 to toggle the breakpoint.
-    try:
-        with session_maker() as session:
-            wechat_ad_publisher_adpos_general = WECHAT_AD_PUBLISHER_ADPOS_GENERAL()
-            wechat_ad_publisher_adpos_general.ad_slot = stat_item['ad_slot']
-            wechat_ad_publisher_adpos_general.click_count = stat_item['click_count']
-            wechat_ad_publisher_adpos_general.click_rate = stat_item['click_rate']
-            wechat_ad_publisher_adpos_general.date = stat_item['date']
-            wechat_ad_publisher_adpos_general.ecpm = stat_item['ecpm']
-            wechat_ad_publisher_adpos_general.exposure_count = stat_item['exposure_count']
-            wechat_ad_publisher_adpos_general.exposure_rate = stat_item['exposure_rate']
-            wechat_ad_publisher_adpos_general.income = stat_item['income']
-            wechat_ad_publisher_adpos_general.req_succ_count = stat_item['req_succ_count']
-            wechat_ad_publisher_adpos_general.slot_str = stat_item['slot_str']
-            wechat_ad_publisher_adpos_general.date_str = stat_item['date'].replace('-','')
-            wechat_ad_publisher_adpos_general.app_type = app_type
-            session.add(wechat_ad_publisher_adpos_general)
-            print(f'add_ad_adpos_data is OK; app_type = {app_type}')
-    except Exception as e:
-        traceback.print_exc()
-        print(f"add_ad_adpos_data error: app_type = {app_type}; traceback.format_exc = {traceback.format_exc()}")
-
-
-
-def post_inform(url, content_text):
-    url = url
-    data = json.dumps(content_text)
-    data = bytes(data, 'utf8')
-    print(f"post_inform data = {data}")
-    headers = {"Content-Type": 'application/json'}
-    req = urllib.request.Request(url=url, headers=headers, data=data)
-    try:
-        resp = urllib.request.urlopen(req, timeout=10).read()
-        print(f"post_inform resp = {resp.decode('utf-8')}")
-        return resp.decode('utf-8')
-    except Exception as e:
-        print(e)
-
-def get_inform(url):
-    url = url
-    headers = {"Content-Type": 'application/json'}
-    print(f"get_inform url = {url}")
-    req = urllib.request.Request(url=url, headers=headers)
-    try:
-        resp = urllib.request.urlopen(req, timeout=10).read()
-        print(f"get_inform resp = {resp.decode('utf-8')}")
-        return resp.decode('utf-8')
-    except Exception as e:
-        print(e)
-
-def get_mp_info(app_type):
-    datestr = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(days=+1), '%Y-%m-%d')
-    print(f"get_mp_info: app_type = {app_type} date = {datestr}")
-    time_str = time.strftime("%Y:%m:%d %H")
-    print(f"get_mp_info: app_type= {app_type} time = {time_str}")
-    md5 = hashlib.md5('{}'.format(time_str).encode(encoding='UTF-8')).hexdigest()
-    print(f"get_mp_info: app_type = {app_type} md5 = {md5}")
-
-    getliveaccesstoken_url = "https://longvideoapi.piaoquantv.com/longvideoapi/weixin/getWxAccessToken/{}".format(app_type)
-    print(f"get_mp_info getliveaccesstoken_url = {getliveaccesstoken_url}")
-    ret = get_inform(getliveaccesstoken_url)
-    data = json.loads(ret).get('data',{})
-    print(f"get_mp_info app_type = {app_type} getWxAccessToken date = {data}")
-    with session_maker() as session:
-        task = session.query(WECHAT_AD_PUBLISHER_ADUNIT_GENERAL).filter_by(date=datestr,app_type=app_type).first()
-        if task is None:
-            getweanalysisappiddailyvisittrend_url = 'https://api.weixin.qq.com/publisher/stat?action=publisher_adunit_general&access_token={}&page=1&page_size=100&start_date={}&end_date={}'.format(
-                data, datestr, datestr)
-            print(f"get_mp_info app_type = {app_type} publisher/stat adunit = {getweanalysisappiddailyvisittrend_url}")
-            ret = get_inform(getweanalysisappiddailyvisittrend_url)
-            print(f"get_mp_info app_type = {app_type} publisher/stat adunit result = {ret}")
-            list = json.loads(ret).get('list',[])
-            for item in list:
-                add_ad_data(item, app_type)
-
-        task = session.query(WECHAT_AD_PUBLISHER_ADPOS_GENERAL).filter_by(date=datestr, app_type=app_type).first()
-        if task is None:
-            getweanalysisappiddailyvisittrend_url = 'https://api.weixin.qq.com/publisher/stat?action=publisher_adpos_general&access_token={}&page=1&page_size=100&start_date={}&end_date={}'.format(
-                data, datestr, datestr)
-            print(f"get_mp_info app_type = {app_type} publisher/stat adops = {getweanalysisappiddailyvisittrend_url}")
-            ret = get_inform(getweanalysisappiddailyvisittrend_url)
-            print(f"get_mp_info app_type = {app_type} publisher/stat adops result = {ret}")
-            list = json.loads(ret).get('list',[])
-            for item in list:
-                add_ad_adpos_data(item, app_type)
-            summary = json.loads(ret)['summary']
-            summary['ad_slot'] = 'SLOT_ID_WEAPP_ALL'
-            summary['date'] = datestr
-            summary['slot_str'] = 'summary'
-            add_ad_adpos_data(summary, app_type)
-
-# Press the green button in the gutter to run the script.
-if __name__ == '__main__':
-    app_type_list = [0,2,3,4,5,6,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36]
-    # app_type_list = [2,23,24,25]
-    for app_type in app_type_list:
-        print(f"start app_type = {app_type}")
-        try:
-            get_mp_info(app_type)
-        except Exception as e:
-            print(f"app_type {app_type} get data error: {traceback.format_exc()}")
-        print(f"end app_type = {app_type}")
-        print("")
-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
-

+ 0 - 421
model/feature.py

@@ -1,421 +0,0 @@
-import glob
-import os.path
-from datetime import timedelta
-
-import numpy as np
-import pandas as pd
-import xgboost as xgb
-
-from model.XGBModel import XGBModel
-
-features = [
-    "cpa",
-    "b2_3h_ctr",
-    "b2_3h_ctcvr",
-    "b2_3h_cvr",
-    "b2_3h_conver",
-    "b2_3h_ecpm",
-    "b2_3h_click",
-    "b2_3h_conver*log(view)",
-    "b2_3h_conver*ctcvr",
-    "b2_6h_ctr",
-    "b2_6h_ctcvr",
-    "b2_6h_cvr",
-    "b2_6h_conver",
-    "b2_6h_ecpm",
-    "b2_6h_click",
-    "b2_6h_conver*log(view)",
-    "b2_6h_conver*ctcvr",
-    "b2_12h_ctr",
-    "b2_12h_ctcvr",
-    "b2_12h_cvr",
-    "b2_12h_conver",
-    "b2_12h_ecpm",
-    "b2_12h_click",
-    "b2_12h_conver*log(view)",
-    "b2_12h_conver*ctcvr",
-    "b2_1d_ctr",
-    "b2_1d_ctcvr",
-    "b2_1d_cvr",
-    "b2_1d_conver",
-    "b2_1d_ecpm",
-    "b2_1d_click",
-    "b2_1d_conver*log(view)",
-    "b2_1d_conver*ctcvr",
-    "b2_3d_ctr",
-    "b2_3d_ctcvr",
-    "b2_3d_cvr",
-    "b2_3d_conver",
-    "b2_3d_ecpm",
-    "b2_3d_click",
-    "b2_3d_conver*log(view)",
-    "b2_3d_conver*ctcvr",
-    "b2_7d_ctr",
-    "b2_7d_ctcvr",
-    "b2_7d_cvr",
-    "b2_7d_conver",
-    "b2_7d_ecpm",
-    "b2_7d_click",
-    "b2_7d_conver*log(view)",
-    "b2_7d_conver*ctcvr",
-    "b3_3h_ctr",
-    "b3_3h_ctcvr",
-    "b3_3h_cvr",
-    "b3_3h_conver",
-    "b3_3h_ecpm",
-    "b3_3h_click",
-    "b3_3h_conver*log(view)",
-    "b3_3h_conver*ctcvr",
-    "b3_6h_ctr",
-    "b3_6h_ctcvr",
-    "b3_6h_cvr",
-    "b3_6h_conver",
-    "b3_6h_ecpm",
-    "b3_6h_click",
-    "b3_6h_conver*log(view)",
-    "b3_6h_conver*ctcvr",
-    "b3_12h_ctr",
-    "b3_12h_ctcvr",
-    "b3_12h_cvr",
-    "b3_12h_conver",
-    "b3_12h_ecpm",
-    "b3_12h_click",
-    "b3_12h_conver*log(view)",
-    "b3_12h_conver*ctcvr",
-    "b3_1d_ctr",
-    "b3_1d_ctcvr",
-    "b3_1d_cvr",
-    "b3_1d_conver",
-    "b3_1d_ecpm",
-    "b3_1d_click",
-    "b3_1d_conver*log(view)",
-    "b3_1d_conver*ctcvr",
-    "b3_3d_ctr",
-    "b3_3d_ctcvr",
-    "b3_3d_cvr",
-    "b3_3d_conver",
-    "b3_3d_ecpm",
-    "b3_3d_click",
-    "b3_3d_conver*log(view)",
-    "b3_3d_conver*ctcvr",
-    "b3_7d_ctr",
-    "b3_7d_ctcvr",
-    "b3_7d_cvr",
-    "b3_7d_conver",
-    "b3_7d_ecpm",
-    "b3_7d_click",
-    "b3_7d_conver*log(view)",
-    "b3_7d_conver*ctcvr",
-    "b4_3h_ctr",
-    "b4_3h_ctcvr",
-    "b4_3h_cvr",
-    "b4_3h_conver",
-    "b4_3h_ecpm",
-    "b4_3h_click",
-    "b4_3h_conver*log(view)",
-    "b4_3h_conver*ctcvr",
-    "b4_6h_ctr",
-    "b4_6h_ctcvr",
-    "b4_6h_cvr",
-    "b4_6h_conver",
-    "b4_6h_ecpm",
-    "b4_6h_click",
-    "b4_6h_conver*log(view)",
-    "b4_6h_conver*ctcvr",
-    "b4_12h_ctr",
-    "b4_12h_ctcvr",
-    "b4_12h_cvr",
-    "b4_12h_conver",
-    "b4_12h_ecpm",
-    "b4_12h_click",
-    "b4_12h_conver*log(view)",
-    "b4_12h_conver*ctcvr",
-    "b4_1d_ctr",
-    "b4_1d_ctcvr",
-    "b4_1d_cvr",
-    "b4_1d_conver",
-    "b4_1d_ecpm",
-    "b4_1d_click",
-    "b4_1d_conver*log(view)",
-    "b4_1d_conver*ctcvr",
-    "b4_3d_ctr",
-    "b4_3d_ctcvr",
-    "b4_3d_cvr",
-    "b4_3d_conver",
-    "b4_3d_ecpm",
-    "b4_3d_click",
-    "b4_3d_conver*log(view)",
-    "b4_3d_conver*ctcvr",
-    "b4_7d_ctr",
-    "b4_7d_ctcvr",
-    "b4_7d_cvr",
-    "b4_7d_conver",
-    "b4_7d_ecpm",
-    "b4_7d_click",
-    "b4_7d_conver*log(view)",
-    "b4_7d_conver*ctcvr",
-    "b5_3h_ctr",
-    "b5_3h_ctcvr",
-    "b5_3h_cvr",
-    "b5_3h_conver",
-    "b5_3h_ecpm",
-    "b5_3h_click",
-    "b5_3h_conver*log(view)",
-    "b5_3h_conver*ctcvr",
-    "b5_6h_ctr",
-    "b5_6h_ctcvr",
-    "b5_6h_cvr",
-    "b5_6h_conver",
-    "b5_6h_ecpm",
-    "b5_6h_click",
-    "b5_6h_conver*log(view)",
-    "b5_6h_conver*ctcvr",
-    "b5_12h_ctr",
-    "b5_12h_ctcvr",
-    "b5_12h_cvr",
-    "b5_12h_conver",
-    "b5_12h_ecpm",
-    "b5_12h_click",
-    "b5_12h_conver*log(view)",
-    "b5_12h_conver*ctcvr",
-    "b5_1d_ctr",
-    "b5_1d_ctcvr",
-    "b5_1d_cvr",
-    "b5_1d_conver",
-    "b5_1d_ecpm",
-    "b5_1d_click",
-    "b5_1d_conver*log(view)",
-    "b5_1d_conver*ctcvr",
-    "b5_3d_ctr",
-    "b5_3d_ctcvr",
-    "b5_3d_cvr",
-    "b5_3d_conver",
-    "b5_3d_ecpm",
-    "b5_3d_click",
-    "b5_3d_conver*log(view)",
-    "b5_3d_conver*ctcvr",
-    "b5_7d_ctr",
-    "b5_7d_ctcvr",
-    "b5_7d_cvr",
-    "b5_7d_conver",
-    "b5_7d_ecpm",
-    "b5_7d_click",
-    "b5_7d_conver*log(view)",
-    "b5_7d_conver*ctcvr",
-    "b8_3h_ctr",
-    "b8_3h_ctcvr",
-    "b8_3h_cvr",
-    "b8_3h_conver",
-    "b8_3h_ecpm",
-    "b8_3h_click",
-    "b8_3h_conver*log(view)",
-    "b8_3h_conver*ctcvr",
-    "b8_6h_ctr",
-    "b8_6h_ctcvr",
-    "b8_6h_cvr",
-    "b8_6h_conver",
-    "b8_6h_ecpm",
-    "b8_6h_click",
-    "b8_6h_conver*log(view)",
-    "b8_6h_conver*ctcvr",
-    "b8_12h_ctr",
-    "b8_12h_ctcvr",
-    "b8_12h_cvr",
-    "b8_12h_conver",
-    "b8_12h_ecpm",
-    "b8_12h_click",
-    "b8_12h_conver*log(view)",
-    "b8_12h_conver*ctcvr",
-    "b8_1d_ctr",
-    "b8_1d_ctcvr",
-    "b8_1d_cvr",
-    "b8_1d_conver",
-    "b8_1d_ecpm",
-    "b8_1d_click",
-    "b8_1d_conver*log(view)",
-    "b8_1d_conver*ctcvr",
-    "b8_3d_ctr",
-    "b8_3d_ctcvr",
-    "b8_3d_cvr",
-    "b8_3d_conver",
-    "b8_3d_ecpm",
-    "b8_3d_click",
-    "b8_3d_conver*log(view)",
-    "b8_3d_conver*ctcvr",
-    "b8_7d_ctr",
-    "b8_7d_ctcvr",
-    "b8_7d_cvr",
-    "b8_7d_conver",
-    "b8_7d_ecpm",
-    "b8_7d_click",
-    "b8_7d_conver*log(view)",
-    "b8_7d_conver*ctcvr",
-    "b6_7d_ctr",
-    "b6_7d_ctcvr",
-    "b6_7d_cvr",
-    "b6_7d_conver",
-    "b6_7d_ecpm",
-    "b6_7d_click",
-    "b6_7d_conver*log(view)",
-    "b6_7d_conver*ctcvr",
-    "b6_14d_ctr",
-    "b6_14d_ctcvr",
-    "b6_14d_cvr",
-    "b6_14d_conver",
-    "b6_14d_ecpm",
-    "b6_14d_click",
-    "b6_14d_conver*log(view)",
-    "b6_14d_conver*ctcvr",
-    "b7_7d_ctr",
-    "b7_7d_ctcvr",
-    "b7_7d_cvr",
-    "b7_7d_conver",
-    "b7_7d_ecpm",
-    "b7_7d_click",
-    "b7_7d_conver*log(view)",
-    "b7_7d_conver*ctcvr",
-    "b7_14d_ctr",
-    "b7_14d_ctcvr",
-    "b7_14d_cvr",
-    "b7_14d_conver",
-    "b7_14d_ecpm",
-    "b7_14d_click",
-    "b7_14d_conver*log(view)",
-    "b7_14d_conver*ctcvr",
-    "viewAll",
-    "clickAll",
-    "converAll",
-    "incomeAll",
-    "ctr_all",
-    "ctcvr_all",
-    "cvr_all",
-    "ecpm_all",
-    "timediff_view",
-    "timediff_click",
-    "timediff_conver",
-    "actionstatic_view",
-    "actionstatic_click",
-    "actionstatic_conver",
-    "actionstatic_income",
-    "actionstatic_ctr",
-    "actionstatic_ctcvr",
-    "actionstatic_cvr",
-    "e1_tags_3d_matchnum",
-    "e1_tags_3d_maxscore",
-    "e1_tags_3d_avgscore",
-    "e1_tags_7d_matchnum",
-    "e1_tags_7d_maxscore",
-    "e1_tags_7d_avgscore",
-    "e1_tags_14d_matchnum",
-    "e1_tags_14d_maxscore",
-    "e1_tags_14d_avgscore",
-    "e2_tags_3d_matchnum",
-    "e2_tags_3d_maxscore",
-    "e2_tags_3d_avgscore",
-    "e2_tags_7d_matchnum",
-    "e2_tags_7d_maxscore",
-    "e2_tags_7d_avgscore",
-    "e2_tags_14d_matchnum",
-    "e2_tags_14d_maxscore",
-    "e2_tags_14d_avgscore",
-    "d1_feature_3h_ctr",
-    "d1_feature_3h_ctcvr",
-    "d1_feature_3h_cvr",
-    "d1_feature_3h_conver",
-    "d1_feature_3h_ecpm",
-    "d1_feature_6h_ctr",
-    "d1_feature_6h_ctcvr",
-    "d1_feature_6h_cvr",
-    "d1_feature_6h_conver",
-    "d1_feature_6h_ecpm",
-    "d1_feature_12h_ctr",
-    "d1_feature_12h_ctcvr",
-    "d1_feature_12h_cvr",
-    "d1_feature_12h_conver",
-    "d1_feature_12h_ecpm",
-    "d1_feature_1d_ctr",
-    "d1_feature_1d_ctcvr",
-    "d1_feature_1d_cvr",
-    "d1_feature_1d_conver",
-    "d1_feature_1d_ecpm",
-    "d1_feature_3d_ctr",
-    "d1_feature_3d_ctcvr",
-    "d1_feature_3d_cvr",
-    "d1_feature_3d_conver",
-    "d1_feature_3d_ecpm",
-    "d1_feature_7d_ctr",
-    "d1_feature_7d_ctcvr",
-    "d1_feature_7d_cvr",
-    "d1_feature_7d_conver",
-    "d1_feature_7d_ecpm",
-    "vid_rank_ctr_1d",
-    "vid_rank_ctr_3d",
-    "vid_rank_ctr_7d",
-    "vid_rank_ctr_14d",
-    "vid_rank_ctcvr_1d",
-    "vid_rank_ctcvr_3d",
-    "vid_rank_ctcvr_7d",
-    "vid_rank_ctcvr_14d",
-    "vid_rank_ecpm_1d",
-    "vid_rank_ecpm_3d",
-    "vid_rank_ecpm_7d",
-    "vid_rank_ecpm_14d"
-]
-
-
-def load_model_and_score(model_path, feature_map):
-    model = xgb.Booster()
-    model.load_model(f"{model_path}/data/XGBoostClassificationModel")
-    model.set_param({"missing": 0.0})
-
-    values = np.array([
-        float(feature_map.get(feature, 0.0))
-        for feature in features
-    ], dtype=np.float32)
-
-    dm = xgb.DMatrix(values.reshape(1, -1), missing=0.0)
-    return float(model.predict(dm, output_margin=False)[0])
-
-
-def _multi_importance_flat_map(importance_map: dict):
-    result = []
-    all_features = set(key for inner_dict in importance_map.values() for key in inner_dict.keys())
-    for feature in all_features:
-        item = {
-            "feature": feature,
-        }
-        for key in importance_map:
-            if feature in importance_map[key]:
-                item[key] = importance_map[key][feature]
-        result.append(item)
-    return result
-
-
-def _main():
-    model_path = "/Users/zhao/Desktop/tzld/ad/model"
-    all_model = glob.glob(f"{model_path}/*")
-    model_dict = {}
-    for e in all_model:
-        if "model_xgb_351_1000_v2" in e:
-            model_dict[e] = XGBModel(model_file=f"{e}/data/XGBoostClassificationModel", features=features)
-
-    weight_dict = {}
-    cover_dict = {}
-    gain_dict = {}
-    for key in model_dict:
-        dt = os.path.basename(key)[-9:]
-        weight_dict[dt] = model_dict[key].feature_weight_importance()
-        cover_dict[dt] = model_dict[key].feature_cover_importance()
-        gain_dict[dt] = model_dict[key].feature_gain_importance()
-
-    weight = _multi_importance_flat_map(dict(sorted(weight_dict.items())))
-    cover = _multi_importance_flat_map(dict(sorted(cover_dict.items())))
-    gain = _multi_importance_flat_map(dict(sorted(gain_dict.items())))
-
-    pd.DataFrame(weight).to_csv("/Users/zhao/Desktop/weight.csv", index=False)
-    pd.DataFrame(cover).to_csv("/Users/zhao/Desktop/cover.csv", index=False)
-    pd.DataFrame(gain).to_csv("/Users/zhao/Desktop/gain.csv", index=False)
-
-
-if __name__ == '__main__':
-    _main()

+ 0 - 197
model/model_predict_analyse_20241115.py

@@ -1,197 +0,0 @@
-import gzip
-import os.path
-
-import pandas as pd
-from hdfs import InsecureClient
-
-client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
-
-SEGMENT_BASE_PATH = os.environ.get("SEGMENT_BASE_PATH", "/Users/zhao/Desktop/tzld/XGB/predict_cache")
-PREDICT_CACHE_PATH = os.environ.get("PREDICT_CACHE_PATH", "/Users/zhao/Desktop/tzld/XGB/predict_cache")
-
-
-def parse_predict_line(line: str) -> [bool, dict]:
-    sp = line.replace("\n", "").split("\t")
-    if len(sp) == 4:
-        label = int(sp[0])
-        cid = sp[3].split("_")[0]
-        score = float(sp[2].replace("[", "").replace("]", "").split(",")[1])
-        return True, {
-            "label": label,
-            "cid": cid,
-            "score": score
-        }
-    return False, {}
-
-
-def read_predict_file(file_path: str) -> pd.DataFrame:
-    result = []
-    if file_path.startswith("/dw"):
-        if not file_path.endswith("/"):
-            file_path += "/"
-        for file in client.list(file_path):
-            with client.read(file_path + file) as reader:
-                with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
-                    for line in gz_file.read().decode("utf-8").split("\n"):
-                        b, d = parse_predict_line(line)
-                        if b: result.append(d)
-    else:
-        with open(file_path, "r") as f:
-            for line in f.readlines():
-                b, d = parse_predict_line(line)
-                if b: result.append(d)
-    return pd.DataFrame(result)
-
-
-def calibration_file_save(df: pd.DataFrame, file_path: str):
-    if file_path.startswith("/dw"):
-        # 完整的分段文件保存
-        with client.write(file_path, encoding='utf-8', overwrite=True) as writer:
-            writer.write(df.to_csv(sep="\t", index=False))
-    else:
-        df.to_csv(file_path, sep="\t", index=False)
-
-
-def predict_local_save_for_auc(old_df: pd.DataFrame, new_df: pd.DataFrame):
-    """
-    本地保存一份评估结果, 计算AUC使用
-    """
-    d = {"old": old_df, "new": new_df}
-    for key in d:
-        df = d[key]
-        if 'score' in df.columns:
-            score_df = df[['label', "score"]]
-            score_df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_1.txt", sep="\t", index=False, header=False)
-        if 'score_2' in df.columns:
-            score_2_df = d[key][['label', "score_2"]]
-            score_2_df.to_csv(f"{PREDICT_CACHE_PATH}/{key}_2.txt", sep="\t", index=False, header=False)
-
-
-def save_full_calibration_file(df: pd.DataFrame, segment_file_path: str):
-    if segment_file_path.startswith("/dw"):
-        # 完整的分段文件保存
-        with client.write(segment_file_path, encoding='utf-8', overwrite=True) as writer:
-            writer.write(df.to_csv(sep="\t", index=False))
-    else:
-        df.to_csv(segment_file_path, sep="\t", index=False)
-
-
-def get_predict_calibration_file(df: pd.DataFrame, predict_basename: str) -> [pd.DataFrame]:
-    """
-    计算模型分的diff_rate
-    """
-    agg_df = predict_df_agg(df)
-    agg_df['diff_rate'] = (agg_df['score_avg'] / agg_df['true_ctcvr'] - 1).mask(agg_df['true_ctcvr'] == 0, 0).round(6)
-    condition = 'view > 1000 and diff_rate >= 0.2'
-    save_full_calibration_file(agg_df, f"{SEGMENT_BASE_PATH}/{predict_basename}.txt")
-    calibration = agg_df[(agg_df['view'] > 1000) & ((agg_df['diff_rate'] >= 0.2) | (agg_df['diff_rate'] <= -0.2))]
-    return calibration
-
-
-def get_predict_basename(predict_path) -> [str]:
-    """
-    获取文件路径的最后一部分,作为与模型关联的文件名
-    """
-    predict_basename = os.path.basename(predict_path)
-    if predict_basename.endswith("/"):
-        predict_basename = predict_basename[:-1]
-
-    return predict_basename
-
-
-def calc_calibration_score2(df: pd.DataFrame, calibration_df: pd.DataFrame) -> [pd.DataFrame]:
-    calibration_df = calibration_df[['cid', 'diff_rate']]
-    df = pd.merge(df, calibration_df, on='cid', how='left').fillna(0)
-    df['score_2'] = df['score'] / (1 + df['diff_rate'])
-    return df
-
-
-def predict_df_agg(df: pd.DataFrame) -> [pd.DataFrame]:
-    # 基础聚合操作
-    agg_operations = {
-        'view': ('cid', 'size'),
-        'conv': ('label', 'sum'),
-        'score_avg': ('score', lambda x: round(x.mean(), 6)),
-    }
-
-    # 如果存在 score_2 列,则增加相关聚合
-    if "score_2" in df.columns:
-        agg_operations['score_2_avg'] = ('score_2', lambda x: round(x.mean(), 6))
-
-    grouped_df = df.groupby("cid").agg(**agg_operations).reset_index()
-    grouped_df['true_ctcvr'] = grouped_df['conv'] / grouped_df['view']
-
-    return grouped_df
-
-
-def _main(old_predict_path: str, new_predict_path: str, calibration_file: str, analyse_file: str):
-    old_df = read_predict_file(old_predict_path)
-    new_df = read_predict_file(new_predict_path)
-
-    old_calibration_df = get_predict_calibration_file(old_df, get_predict_basename(old_predict_path))
-    old_df = calc_calibration_score2(old_df, old_calibration_df)
-
-    new_calibration_df = get_predict_calibration_file(new_df, get_predict_basename(new_predict_path))
-    new_df = calc_calibration_score2(new_df, new_calibration_df)
-
-    # 本地保存label、score以及校准后的score,用于计算AUC等信息
-    predict_local_save_for_auc(old_df, new_df)
-
-    # 新模型校准文件保存本地,用于同步OSS
-    new_calibration_df[['cid', 'diff_rate']].to_csv(calibration_file, sep="\t", index=False, header=False)
-
-    old_agg_df = predict_df_agg(old_df)
-    new_agg_df = predict_df_agg(new_df)
-
-    # 字段重命名,和列过滤
-    old_agg_df.rename(columns={'score_avg': 'old_score_avg', 'score_2_avg': 'old_score_2_avg'}, inplace=True)
-    new_agg_df.rename(columns={'score_avg': 'new_score_avg', 'score_2_avg': 'new_score_2_avg'}, inplace=True)
-    old_group_df = old_agg_df[['cid', 'view', 'conv', 'true_ctcvr', 'old_score_avg', 'old_score_2_avg']]
-    new_group_df = new_agg_df[['cid', 'new_score_avg', 'new_score_2_avg']]
-    merged = pd.merge(old_group_df, new_group_df, on='cid', how='left')
-
-    # 计算与真实ctcvr的差异值
-    merged["(new-true)/true"] = (merged['new_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
-    merged["(old-true)/true"] = (merged['old_score_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
-
-    # 计算校准后的模型分与ctcvr的差异值
-    merged["(new2-true)/true"] = (merged['new_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
-    merged["(old2-true)/true"] = (merged['old_score_2_avg'] / merged['true_ctcvr'] - 1).mask(merged['true_ctcvr'] == 0, 0)
-
-    # 按照曝光排序,写入本地文件
-    merged = merged.sort_values(by=['view'], ascending=False)
-    merged = merged[[
-        'cid', 'view', "conv", "true_ctcvr",
-        "old_score_avg", "new_score_avg", "(old-true)/true", "(new-true)/true",
-        "old_score_2_avg", "new_score_2_avg", "(old2-true)/true", "(new2-true)/true",
-    ]]
-
-    # 根据文件名保存不同的格式
-    if analyse_file.endswith(".csv"):
-        merged.to_csv(analyse_file, index=False)
-    else:
-        with open(analyse_file, "w") as writer:
-            writer.write(merged.to_string(index=False))
-    print("0")
-
-
-if __name__ == '__main__':
-    _main(
-        old_predict_path="/Users/zhao/Desktop/tzld/XGB/predict_result/20241210_351_1000_1201_1207.txt",
-        new_predict_path="/Users/zhao/Desktop/tzld/XGB/predict_result/20241210_351_1000_1203_1209.txt",
-        calibration_file="/Users/zhao/Desktop/tzld/XGB/calibration_file/model_xgb_351_1000_v2_calibration.txt",
-        analyse_file="/Users/zhao/Desktop/tzld/XGB/predict_cache/analyse_file.txt"
-    )
-    # parser = argparse.ArgumentParser(description="model_predict_analyse_20241101.py")
-    # parser.add_argument("-op", "--old_predict_path", required=True, help="老模型评估结果")
-    # parser.add_argument("-np", "--new_predict_path", required=True, help="新模型评估结果")
-    # parser.add_argument("-af", "--analyse_file", required=True, help="最后计算结果的保存路径")
-    # parser.add_argument("-cf", "--calibration_file", required=True, help="线上使用的segment文件保存路径")
-    # args = parser.parse_args()
-    #
-    # _main(
-    #     old_predict_path=args.old_predict_path,
-    #     new_predict_path=args.new_predict_path,
-    #     calibration_file=args.calibration_file,
-    #     analyse_file=args.analyse_file
-    # )

+ 0 - 95
model/ros_multi_class_model_predice_analyse.py

@@ -1,95 +0,0 @@
-import numpy as np
-from sklearn.metrics import roc_auc_score
-
-
-def parse_line(line):
-    """ 解析每一行数据 """
-    parts = line.strip().split("\t")
-    label = int(parts[0])
-    scores = np.array([float(x) for x in parts[2].strip("[]").split(",")])
-
-    # 找到最大值索引
-    max_index = np.argmax(scores)
-
-    # 生成 (label, score) 形式的 aucs
-    aucs = np.array([(1 if i == label else 0, scores[i]) for i in range(len(scores))])
-
-    # 生成 (是否为真实 label, 是否为最大值) 的 accuracyRate
-    accuracy_rate = np.array([(1 if i == label else 0, 1 if i == max_index else 0) for i in range(len(scores))])
-
-    return aucs, accuracy_rate
-
-
-def compute_auc(auc_data):
-    """ 计算 AUC 使用 roc_auc_score """
-    num_classes = len(auc_data[0])  # 8 classes
-    auc_scores = []
-
-    for i in range(num_classes):
-        col_data = np.array([row[i] for row in auc_data])  # 取第 i 列
-        labels, scores = col_data[:, 0], col_data[:, 1]
-
-        # 计算 AUC
-        auc = roc_auc_score(labels, scores)
-        auc_scores.append(auc)
-
-    return auc_scores
-
-
-def compute_accuracy_rate(acc_data):
-    """ 计算 accuracy """
-    num_classes = len(acc_data[0])  # 8 classes
-
-    # 全局 accuracy 计算
-    acc_flatten = np.vstack(acc_data)
-    global_correct = np.sum((acc_flatten[:, 0] == 1) & (acc_flatten[:, 1] == 1))
-    total_count = acc_flatten.shape[0] / num_classes
-    global_accuracy = global_correct / total_count
-
-    # 按 label 计算 accuracy
-    per_label_accuracy = []
-    for i in range(num_classes):
-        col_data = np.array([row[i] for row in acc_data])  # 取第 i 列
-
-        # 过滤这个分类的数据
-        class_all_data = col_data[col_data[:, 1] == 1]
-        # 过滤这个分类中预估对的数据
-        positive_data = class_all_data[class_all_data[:, 0] == 1]
-
-        class_cnt = class_all_data.shape[0]
-        positive_cnt = positive_data.shape[0]
-
-        accuracy = 0 if class_cnt == 0 else positive_cnt / class_cnt
-        per_label_accuracy.append(accuracy)
-
-    return global_accuracy, per_label_accuracy
-
-
-if __name__ == "__main__":
-    file_path = "/Users/zhao/Desktop/tzld/ros/ros_predict_20250302.txt"  # 本地文件路径
-
-    # 读取数据
-    with open(file_path, "r") as f:
-        data_lines = f.readlines()
-
-    # 解析数据
-    parsed_data = [parse_line(line) for line in data_lines]
-    auc_data = [item[0] for item in parsed_data]
-    acc_data = [item[1] for item in parsed_data]
-
-    # 计算 AUC
-    auc_scores = compute_auc(auc_data)
-
-    # 计算 Accuracy
-    global_acc, per_label_acc = compute_accuracy_rate(acc_data)
-
-    # 打印结果
-    print("AUC Scores:")
-    for i, auc in enumerate(auc_scores):
-        print(f"Label {i}: AUC = {auc:.4f}")
-
-    print(f"\nGlobal Accuracy: {global_acc:.4f}")
-
-    print("\nPer Label Accuracy:")
-    for i, acc in enumerate(per_label_acc):
-        print(f"Label {i}: Accuracy = {acc:.4f}")

+ 0 - 0
model/segment_calibration_check.py


+ 0 - 0
XGB/__init__.py → monitor/__init__.py


+ 80 - 0
monitor/automation_provide_job_monitor.py

@@ -0,0 +1,80 @@
+import datetime
+
+from aliyun.log import LogClient
+from aliyun.log.auth import AUTH_VERSION_4
+
+from util import feishu_inform_util
+
+endpoint = "cn-hangzhou.log.aliyuncs.com"
+access_key = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+access_key_id = "LTAIWYUujJAm7CbH"
+project = "crawler-scheduler"
+log_store = "aigc-provider"
+query_sql = "* | select crawlerMode, result, if(reason='null', '成功', reason) as reason, count(distinct videoId) as videoIdCnt, count(distinct crawlerPlanId) as crawlerPlanIdCnt  from log where reason not in ('该账号已经存在爬取计划,跳过执行', '该视频近期已经处理过', '该Topic已经创建过爬取计划', '该关键词已经创建过爬取计划') group by crawlerMode, result, reason order by crawlerMode, result desc, reason"
+
+client = LogClient(endpoint=endpoint, accessKey=access_key, accessKeyId=access_key_id, auth_version=AUTH_VERSION_4, region='cn-hangzhou')
+webhook = 'https://open.feishu.cn/open-apis/bot/v2/hook/9f5c5cce-5eb2-4731-b368-33926f5549f9'
+
+
+def send_feishu_card_msg(title, content):
+    card_json = {
+        "schema": "2.0",
+        "header": {
+            "title": {
+                "tag": "plain_text",
+                "content": title
+            },
+            "template": "blue"
+        },
+        "body": {
+            "elements": [
+                {
+                    "tag": "markdown",
+                    "content": content,
+                    "text_align": "left",
+                    "text_size": "normal",
+                    "element_id": "taskExecuteCnt"
+                }
+            ]
+        }
+    }
+    feishu_inform_util.send_card_msg_to_feishu(webhook, card_json)
+
+
+def main():
+    # 获取当前日期
+    today = datetime.datetime.now()
+
+    # 当天开始时间(00:00:00)
+    start_of_day = datetime.datetime.combine(today.date(), datetime.time.min)
+    # 当天结束时间(23:59:59.999999)
+    end_of_day = datetime.datetime.combine(today.date(), datetime.time.max)
+
+    # 转换为时间戳(秒级)
+    start_timestamp = int(start_of_day.timestamp())
+    end_timestamp = int(end_of_day.timestamp())
+
+    resp = client.get_log(project=project, logstore=log_store, from_time=start_timestamp, to_time=end_timestamp, query=query_sql)
+    log_data = resp.get_body().get('data')
+
+    crawler_mode_set = set()
+    for datum in log_data:
+        crawler_mode_set.add(datum.get('crawlerMode'))
+
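+    # Render the rows of each crawlerMode as a markdown table and send one card per mode.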
+    for crawler_mode in crawler_mode_set:
+        title = f"{crawler_mode} 执行情况监控"
+        content = "| reason | videoIdCnt | crawlerPlanIdCnt |\n"
+        content += "| --- | --- | --- |\n"
+        for datum in log_data:
+            if crawler_mode != datum.get('crawlerMode'):
+                continue
+            reason = datum.get('reason')
+            video_id_cnt = datum.get('videoIdCnt')
+            crawler_plan_id_cnt = datum.get('crawlerPlanIdCnt')
+            content += f"| {reason} | {video_id_cnt} | {crawler_plan_id_cnt} |\n"
+
+        send_feishu_card_msg(title, content)
+
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
script/feature_spark_monitor.py → monitor/feature_spark_monitor.py


+ 0 - 0
script/hadoop_monitor.py → monitor/hadoop_monitor.py


This file's diff is hidden because it is too large
+ 0 - 45
script/alg_table_info.py


+ 0 - 90
script/data_download.py

@@ -1,90 +0,0 @@
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Callable, Sequence
-
-from client import ODPSClient
-
-odps_client = ODPSClient.ODPSClient()
-
-
-def process_tasks(tasks: Sequence[Callable[[], None]], max_workers: int) -> None:
-    """
-    通用任务处理器,将任务分批并发执行。
-
-    :param tasks: 一个可迭代对象,每个元素是一个 callable(无需参数)
-    :param max_workers: 最大并发数
-    """
-    total_tasks = len(tasks)
-    task_counter = 0
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        future_tasks = {}
-
-        for task in tasks:
-            task_counter += 1
-            print(f"提交任务: 第 {task_counter}/{total_tasks} 个任务")
-
-            # 提交任务
-            future = executor.submit(task)
-            future_tasks[future] = (task, task_counter)
-
-            time.sleep(0.01)
-
-            # 控制每批次提交的任务数
-            if len(future_tasks) == max_workers or task_counter == total_tasks:
-                # 等待当前批次完成
-                for future in as_completed(future_tasks):
-                    task, counter = future_tasks[future]
-                    try:
-                        # 获取任务执行结果
-                        future.result()
-                        print(f"任务完成: 第 {counter}/{total_tasks} 个任务")
-                    except Exception as exc:
-                        print(f"任务出错: 第 {counter}/{total_tasks} 个任务出错, {exc}")
-                # 清空当前批次任务
-                future_tasks = {}
-
-
-def ad_download() -> None:
-    max_workers = 24
-    sql_file_path = "/Users/zhao/Desktop/tzld/ad/sql/特征平均值.sql"
-    dts = ["20241206",
-           "20241207",
-           "20241208",
-           "20241209",
-           "20241210",
-           "20241211",
-           "20241212",
-           "20241213",
-           "20241214",
-           "20241215",
-           "20241216"]
-
-    def create_task(dt: str) -> Callable[[], None]:
-        def task() -> None:
-            params = {
-                "dt_1": dt,
-                "dt_2": dt
-            }
-            result_file_path = f"/Users/zhao/Desktop/tzld/ad/特征/{dt}.csv"
-            print(f"准备任务: {dt}")
-            odps_client.execute_sql_file_result_save_fle(
-                sql_file_path,
-                params,
-                result_file_path
-            )
-
-        return task
-
-    tasks = [create_task(dt) for dt in dts]
-
-    process_tasks(tasks, max_workers)
-    print("数据下载完成。")
-
-
-def _main():
-    ad_download()
-
-
-if __name__ == "__main__":
-    _main()

+ 0 - 0
script/t.py


+ 0 - 0
vov/__init__.py


+ 0 - 151
vov/vov_h0_train.py

@@ -1,151 +0,0 @@
-import numpy as np
-import pandas as pd
-from scipy.optimize import minimize
-from sklearn.metrics import r2_score
-from sklearn.model_selection import train_test_split
-import pickle
-
-
-# 1. 加载数据
-def load_data(file_path):
-    df = pd.read_csv(file_path, na_values='\\N')
-    return df
-
-# 2. 数据预处理
-def preprocess_data(df, features, target, exposure_col, top_k):
-    # 按曝光量排序并选择 Top k 数据
-    df_sorted = df.sort_values(by=exposure_col, ascending=False)
-    df_topk = df_sorted.head(top_k)
-
-    X = df_topk[features]
-    y = df_topk[target]
-
-    # 获取 Top K 对应的曝光阈值
-    exposure_threshold = df_topk[exposure_col].min()
-
-    return X, y, exposure_threshold,df_topk
-
-# 3. 计算相关系数
-def calculate_correlations(df, features, target):
-    correlations = {}
-    for feature in features:
-        # 删除 target 或 feature 列中任一为空的行
-        valid_data = df[[target, feature]].dropna()
-
-        # 如果没有有效数据,相关系数设为 0
-        if len(valid_data) == 0:
-            correlations[feature] = 0
-        else:
-            # 计算相关系数
-            corr = valid_data[target].corr(valid_data[feature])
-            correlations[feature] = corr if not np.isnan(corr) else 0
-
-    # 转换为 Series 并按绝对值大小排序
-    corr_series = pd.Series(correlations).abs().sort_values(ascending=False)
-    return corr_series
-
-
-# 4. 定义动态加权和函数
-def dynamic_weighted_sum(features, weights):
-    valid_features = ~np.isnan(features)
-    if np.sum(valid_features) == 0:
-        return np.nan
-    normalized_weights = weights[valid_features] / np.sum(weights[valid_features])
-    return np.sum(features[valid_features] * normalized_weights)
-
-# 5. 定义损失函数
-def mse_loss(y_true, y_pred):
-    valid = ~np.isnan(y_true) & ~np.isnan(y_pred)
-    return np.mean((y_true[valid] - y_pred[valid])**2)
-
-# 6. 定义目标函数
-def objective(weights, X, y_true):
-    y_pred = np.array([dynamic_weighted_sum(x, weights) for x in X.values])
-    return mse_loss(y_true, y_pred)
-
-# 7. 搜索最佳权重
-def find_best_weights(X, y, initial_weights):
-    result = minimize(objective, initial_weights, args=(X, y), method='Nelder-Mead')
-    return result.x
-
-# 8. 评估模型
-def evaluate_model(X, y, weights):
-    y_pred = np.array([dynamic_weighted_sum(x, weights) for x in X.values])
-    valid = ~np.isnan(y) & ~np.isnan(y_pred)
-    r2 = r2_score(y[valid], y_pred[valid])
-    mse = mse_loss(y, y_pred)
-    return r2, mse
-
-# 9. 保存模型
-def save_model(weights, features, exposure_threshold,top_k, file_path):
-    model = {
-        'weights': weights,
-        'features': features,
-        'exposure_threshold': exposure_threshold,
-        'top_k':top_k
-    }
-    with open(file_path, 'wb') as f:
-        pickle.dump(model, f)
-
-# 10. 加载模型
-def load_model(file_path):
-    with open(file_path, 'rb') as f:
-        model = pickle.load(f)
-    return model['weights'], model['features'], model['exposure_threshold'],model['top_k']
-
-
-
-# 12. 主函数
-def main():
-    # 加载数据
-    df = load_data('train_20240921.csv')
-
-    # 定义特征、目标变量和曝光量列
-    features = ['h1_ago_vov', 'h2_ago_vov', 'h3_ago_vov', 'h24_ago_vov', 'h48_ago_vov', 'd1_ago_vov', 'd2_ago_vov']
-    target = 'cur_hour_vov'
-    exposure_col = 'h1_ago_view'  # 请确保你的数据中有这个列
-    top_k = 1000  # 设置你想要使用的 Top k 数据点数量
-
-    # 预处理数据
-    X, y, exposure_threshold,df_topk = preprocess_data(df, features, target, exposure_col, top_k)
-
-    # 划分训练集和测试集
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-    # 计算相关系数
-    correlations = calculate_correlations(df_topk, features, target)
-    print("Feature correlations:")
-    print(correlations)
-
-    # 使用相关系数作为初始权重
-    initial_weights = correlations[features].values
-
-    # 搜索最佳权重
-    best_weights = find_best_weights(X_train, y_train, initial_weights)
-
-    # 评估模型
-    r2_train, mse_train = evaluate_model(X_train, y_train, best_weights)
-    r2_test, mse_test = evaluate_model(X_test, y_test, best_weights)
-
-    print(f"\nTrain R² Score: {r2_train:.4f}, MSE: {mse_train:.4f}")
-    print(f"Test R² Score: {r2_test:.4f}, MSE: {mse_test:.4f}")
-
-    # 输出特征重要性
-    print("\nFeature importance:")
-    for feature, weight in zip(features, best_weights):
-        print(f"{feature}: {weight:.4f}")
-
-    # 保存模型
-    save_model(pd.Series(best_weights, index=features), features, exposure_threshold,top_k, 'top'+str(top_k)+'_linear_weighted_model.pkl')
-
-    # 测试加载模型
-    loaded_weights, loaded_features, loaded_threshold,topk = load_model('top'+str(top_k)+'_linear_weighted_model.pkl')
-    print("\nLoaded model weights:")
-    for feature, weight in loaded_weights.items():
-        print(f"{feature}: {weight:.4f}")
-    print(f"Exposure threshold: {loaded_threshold}")
-    print(f"TopK: {topk}")
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 192
vov/vov_h24_analyse.py

@@ -1,192 +0,0 @@
-import numpy as np
-import pandas as pd
-from scipy.optimize import minimize
-from sklearn.metrics import r2_score
-from sklearn.model_selection import train_test_split
-import pickle
-
-all_feature_names = ["1_vovh0", "2_vovh0", "2_vovh1", "3_vovh0", "3_vovh1", "3_vovh2", "4_vovh0", "4_vovh1", "4_vovh2",
-                     "4_vovh3", "6_vovh0", "6_vovh1", "6_vovh6", "12_vovh0", "12_vovh1", "12_vovh12", "24_vovh0",
-                     "24_vovh1", "24_vovh2", "24_vovh3", "24_vovh6", "24_vovh12", "24_vovh24", "48_vovh0", "48_vovh1",
-                     "48_vovh2", "48_vovh3", "48_vovh6", "48_vovh12", "48_vovh24", "48_vovh48", "1_vovd0", "2_vovd0",
-                     "3_vovd0", "4_vovd0", "5_vovd0", "2_vovd1", "3_vovd1", "4_vovd1", "5_vovd1", "3_vovd2", "4_vovd2",
-                     "5_vovd2", "1_vovh_分母", "1_vovh0分子", "2_vovh_分母", "2_vovh0分子", "2_vovh1分子",
-                     "3_vovh_分母", "3_vovh0分子", "3_vovh1分子", "3_vovh2分子", "4_vovh_分母", "4_vovh0分子",
-                     "4_vovh1分子", "4_vovh2分子", "4_vovh3分子", "6_vovh_分母", "6_vovh0分子", "6_vovh1分子",
-                     "6_vovh6分子", "12_vovh_分母", "12_vovh0分子", "12_vovh1分子", "12_vovh12分子", "24_vovh_分母",
-                     "24_vovh0分子", "24_vovh1分子", "24_vovh2分子", "24_vovh3分子", "24_vovh6分子", "24_vovh12分子",
-                     "24_vovh24分子", "48_vovh_分母", "48_vovh0分子", "48_vovh1分子", "48_vovh2分子", "48_vovh3分子",
-                     "48_vovh6分子", "48_vovh12分子", "48_vovh24分子", "48_vovh48分子", "1_vovd0_分母", "1_vovd0_分子",
-                     "2_vovd0_分母", "2_vovd0_分子", "3_vovd0_分母", "3_vovd0_分子", "4_vovd0_分母", "4_vovd0_分子",
-                     "5_vovd0_分母", "5_vovd0_分子", "2_vovd1_分母", "2_vovd1_分子", "3_vovd1_分母", "3_vovd1_分子",
-                     "4_vovd1_分母", "4_vovd1_分子", "5_vovd1_分母", "5_vovd1_分子", "3_vovd2_分母", "3_vovd2_分子",
-                     "4_vovd2_分母", "4_vovd2_分子", "5_vovd2_分母", "5_vovd2_分子"]
-
-# feature_names = ["1_vovh0",
-#                  "2_vovh0", "2_vovh1",
-#                  "3_vovh0", "3_vovh1", "3_vovh2",
-#                  "4_vovh0", "4_vovh1", "4_vovh2", "4_vovh3",
-#                  "6_vovh0", "6_vovh1", "6_vovh6",
-#                  "12_vovh0", "12_vovh1", "12_vovh12",
-#                  "24_vovh0", "24_vovh1", "24_vovh2", "24_vovh3", "24_vovh6", "24_vovh12", "24_vovh24",
-#                  "48_vovh0", "48_vovh1", "48_vovh2", "48_vovh3", "48_vovh6", "48_vovh12", "48_vovh24", "48_vovh48",
-#                  "1_vovd0", "2_vovd0", "3_vovd0",
-#                  "2_vovd1", "3_vovd1"
-#                  ]
-
-feature_names = ["1_vovh0",
-                 "2_vovh0", "2_vovh1",
-                 "3_vovh1", "3_vovh2",
-                 "4_vovh1", "4_vovh3",
-                 "6_vovh1", "6_vovh6",
-                 "12_vovh1", "12_vovh12",
-                 "24_vovh1", "24_vovh2", "24_vovh3", "24_vovh6", "24_vovh12", "24_vovh24",
-                 "48_vovh1", "48_vovh2", "48_vovh3", "48_vovh6", "48_vovh12", "48_vovh24", "48_vovh48",
-                 "1_vovd0",
-                 "2_vovd1", "3_vovd1"
-                 ]
-
-dt_list = ['20241014', '20241015', '20241016']
-hh_list = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
-           "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23"]
-
-pd.set_option('display.max_rows', None)  # 显示所有行
-pd.set_option('display.max_columns', None)  # 显示所有列
-
-
-# 1. 加载数据
-def load_data(filepath: str) -> pd.DataFrame:
-    return pd.read_csv(filepath)
-
-
-# 2. 数据预处理
-def preprocess_data(df, features, target):
-    df_sorted = df.sort_values(by=target, ascending=False)
-    x = df_sorted[features]
-    y = df_sorted[target]
-
-    top_k = df_sorted.head(100)
-
-    return x, y, top_k
-
-
-# 3. 计算相关系数
-def calculate_correlations(df, features, target):
-    correlations = {}
-    for feature in features:
-        # 删除 target 或 feature 列中任一为空的行
-        valid_data = df[[target, feature]].dropna()
-
-        # 如果没有有效数据,相关系数设为 0
-        if len(valid_data) == 0:
-            correlations[feature] = 0
-        else:
-            # 计算相关系数
-            corr = valid_data[target].corr(valid_data[feature])
-            correlations[feature] = corr if not np.isnan(corr) else 0
-
-    # 转换为 Series 并按绝对值大小排序
-    corr_series = pd.Series(correlations).abs().sort_values(ascending=False)
-    return corr_series
-
-
-# 4. 定义动态加权和函数
-def dynamic_weighted_sum(features, weights):
-    valid_features = ~np.isnan(features)
-    if np.sum(valid_features) == 0:
-        return np.nan
-    normalized_weights = weights[valid_features] / np.sum(weights[valid_features])
-    return np.sum(features[valid_features] * normalized_weights)
-
-
-# 5. 定义损失函数
-def mse_loss(y_true, y_pred):
-    valid = ~np.isnan(y_true) & ~np.isnan(y_pred)
-    return np.mean((y_true[valid] - y_pred[valid]) ** 2)
-
-
-# 6. 定义目标函数
-def objective(weights, X, y_true):
-    y_pred = np.array([dynamic_weighted_sum(x, weights) for x in X.values])
-    return mse_loss(y_true, y_pred)
-
-
-# 7. 搜索最佳权重
-def find_best_weights(X, y, initial_weights):
-    result = minimize(objective, initial_weights, args=(X, y), method='Nelder-Mead')
-    return result.x
-
-
-# 8. 评估模型
-def evaluate_model(X, y, weights):
-    y_pred = np.array([dynamic_weighted_sum(x, weights) for x in X.values])
-    valid = ~np.isnan(y) & ~np.isnan(y_pred)
-    r2 = r2_score(y[valid], y_pred[valid])
-    mse = mse_loss(y, y_pred)
-    return r2, mse
-
-
-# 9. 保存模型
-def save_model(weights, features, file_path):
-    model = {
-        'weights': weights,
-        'features': features,
-    }
-    with open(file_path, 'wb') as f:
-        pickle.dump(model, f)
-
-
-# 10. 加载模型
-def load_model(file_path):
-    with open(file_path, 'rb') as f:
-        model = pickle.load(f)
-    return model['weights'], model['features']
-
-
-def single_dt_handle(dt, df: pd.DataFrame):
-    x, y, top_key = preprocess_data(df, feature_names, "vovh24")
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
-    correl = calculate_correlations(top_key, feature_names, "vovh24")
-    print(f"{dt}   Feature Correlations: ")
-    print(correl.head(5))
-
-    initial_weights = correl[feature_names].values
-    best_weights = find_best_weights(x_train, y_train, initial_weights)
-    # 评估模型
-    r2_train, mse_train = evaluate_model(x_train, y_train, best_weights)
-    r2_test, mse_test = evaluate_model(x_test, y_test, best_weights)
-
-    print(f"\nTrain R² Score: {r2_train:.4f}, MSE: {mse_train:.4f}")
-    print(f"Test R² Score: {r2_test:.4f}, MSE: {mse_test:.4f}")
-
-    # 输出特征重要性
-    print("\nFeature importance:")
-    for feature, weight in zip(feature_names, best_weights):
-        print(f"{feature}: {weight:.4f}")
-
-    # 保存模型
-    save_model(pd.Series(best_weights, index=feature_names), feature_names,
-               '/Users/zhao/Desktop/vov/model/vovh24_linear_weighted_model.pkl')
-
-    # 测试加载模型
-    loaded_weights, loaded_features = load_model('/Users/zhao/Desktop/vov/model/vovh24_linear_weighted_model.pkl')
-    print("\nLoaded model weights:")
-    for feature, weight in loaded_weights.items():
-        print(f"{feature}: {weight:.4f}")
-
-
-def _main():
-    df_dict = {}
-    for dt in dt_list:
-        for hh in hh_list:
-            key = f"{dt}{hh}"
-            df = load_data(f"/Users/zhao/Desktop/vov/{key}.csv")
-            df_dict[key] = df
-
-    for key in df_dict:
-        single_dt_handle(key, df_dict[key])
-        return
-
-
-if __name__ == '__main__':
-    _main()

Some files are not shown because a large number of files changed in this diff