liqian há 1 ano
pai
commit
cbf0eb2d6d
3 ficheiros alterados com 227 adições e 0 exclusões
  1. 133 0
      ad_predict_video_data_process.py
  2. 26 0
      ad_xgboost_predict.py
  3. 68 0
      ad_xgboost_predict_data_generate.py

+ 133 - 0
ad_predict_video_data_process.py

@@ -0,0 +1,133 @@
+import os.path
+import time
+import datetime
+import pandas as pd
+from odps import ODPS
+
# ODPS service configuration.
# SECURITY NOTE(review): access credentials were hard-coded here and are now
# part of the repository history. They are read from the environment first,
# falling back to the original literals so existing deployments keep working —
# rotate these keys and then delete the fallbacks.
odps_config = {
    'ENDPOINT': os.environ.get('ODPS_ENDPOINT', 'http://service.cn.maxcompute.aliyun.com/api'),
    'ACCESSID': os.environ.get('ODPS_ACCESS_ID', 'LTAIWYUujJAm7CbH'),
    'ACCESSKEY': os.environ.get('ODPS_ACCESS_KEY', 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P'),
}

# Per-video feature columns read from the ODPS table (30-day window stats).
features = [
    'apptype',
    'videoid',
    'video_preview_count_uv_30day',
    'video_preview_count_pv_30day',
    'video_view_count_uv_30day',
    'video_view_count_pv_30day',
    'video_play_count_uv_30day',
    'video_play_count_pv_30day',
    'video_share_count_uv_30day',
    'video_share_count_pv_30day',
    'video_return_count_30day',
    'video_ctr_uv_30day',
    'video_ctr_pv_30day',
    'video_share_rate_uv_30day',
    'video_share_rate_pv_30day',
    'video_return_rate_30day',
]
+
+
def get_feature_data(project, table, dt, app_type):
    """Fetch one day's video feature rows from ODPS as a DataFrame.

    Runs a full-table select filtered on partition date and app type, then
    keeps only the columns listed in the module-level ``features`` list.
    """
    odps = ODPS(
        access_id=odps_config['ACCESSID'],
        secret_access_key=odps_config['ACCESSKEY'],
        project=project,
        endpoint=odps_config['ENDPOINT'],
    )
    sql = f"select * from {project}.{table} where dt={dt} and apptype={app_type}"
    with odps.execute_sql(sql).open_reader() as reader:
        rows = [{name: record[name] for name in features} for record in reader]
        return pd.DataFrame(rows)
+
+
+def user_data_process(project, table, dt, app_type):
+    """每日特征处理"""
+    print('step 1: get video feature data')
+    feature_initial_df = get_feature_data(project=project, table=table, dt=dt, app_type=app_type)
+    print(f"feature_initial_df shape: {feature_initial_df.shape}")
+    print('step 2: process')
+    feature_initial_df['apptype'] = feature_initial_df['apptype'].astype(int)
+    feature_df = feature_initial_df.copy()
+    # 缺失值填充
+    feature_df.fillna(0, inplace=True)
+    # 数据类型校正
+    type_int_columns = [
+        'video_preview_count_uv_30day',
+        'video_preview_count_pv_30day',
+        'video_view_count_uv_30day',
+        'video_view_count_pv_30day',
+        'video_play_count_uv_30day',
+        'video_play_count_pv_30day',
+        'video_share_count_uv_30day',
+        'video_share_count_pv_30day',
+        'video_return_count_30day',
+    ]
+    for column_name in type_int_columns:
+        feature_df[column_name].astype(int)
+    type_float_columns = [
+        'video_ctr_uv_30day',
+        'video_ctr_pv_30day',
+        'video_share_rate_uv_30day',
+        'video_share_rate_pv_30day',
+        'video_return_rate_30day',
+    ]
+    for column_name in type_float_columns:
+        feature_df[column_name].astype(float)
+    print(f"feature_df shape: {feature_df.shape}")
+    print('step 3: add new video feature')
+    # 补充新用户默认数据(使用均值)
+    new_video_feature = {
+        'apptype': app_type,
+        'videoid': '-1',
+        'video_preview_count_uv_30day': int(feature_df['video_preview_count_uv_30day'].mean()),
+        'video_preview_count_pv_30day': int(feature_df['video_preview_count_pv_30day'].mean()),
+        'video_view_count_uv_30day': int(feature_df['video_view_count_uv_30day'].mean()),
+        'video_view_count_pv_30day': int(feature_df['video_view_count_pv_30day'].mean()),
+        'video_play_count_uv_30day': int(feature_df['video_play_count_uv_30day'].mean()),
+        'video_play_count_pv_30day': int(feature_df['video_play_count_pv_30day'].mean()),
+        'video_share_count_uv_30day': int(feature_df['video_share_count_uv_30day'].mean()),
+        'video_share_count_pv_30day': int(feature_df['video_share_count_pv_30day'].mean()),
+        'video_return_count_30day': int(feature_df['video_return_count_30day'].mean()),
+    }
+    new_video_feature['video_ctr_uv_30day'] = float(
+        new_video_feature['video_play_count_uv_30day'] / new_video_feature['video_view_count_uv_30day'])
+    new_video_feature['video_ctr_pv_30day'] = float(
+        new_video_feature['video_play_count_pv_30day'] / new_video_feature['video_view_count_pv_30day'])
+    new_video_feature['video_share_rate_uv_30day'] = float(
+        new_video_feature['video_share_count_uv_30day'] / new_video_feature['video_play_count_uv_30day'])
+    new_video_feature['video_share_rate_pv_30day'] = float(
+        new_video_feature['video_share_count_pv_30day'] / new_video_feature['video_play_count_pv_30day'])
+    new_video_feature['video_return_rate_30day'] = float(
+        new_video_feature['video_return_count_30day'] / new_video_feature['video_view_count_pv_30day'])
+    new_video_feature_df = pd.DataFrame([new_video_feature])
+    video_df = pd.concat([feature_df, new_video_feature_df])
+    print(f"video_df shape: {video_df.shape}")
+    print(f"step 4: to csv")
+    # 写入csv
+    predict_data_dir = './data/predict_data'
+    if not os.path.exists(predict_data_dir):
+        os.makedirs(predict_data_dir)
+    video_df.to_csv(f"{predict_data_dir}/video_feature.csv", index=False)
+
+
if __name__ == '__main__':
    # Process yesterday's partition for app_type 0 and report wall time.
    start = time.time()
    run_dt = (datetime.datetime.today() - datetime.timedelta(days=1)).strftime('%Y%m%d')
    user_data_process(project='loghubods', table='admodel_testset_video', dt=run_dt, app_type=0)
    print(time.time() - start)

+ 26 - 0
ad_xgboost_predict.py

@@ -0,0 +1,26 @@
+import os
+import pandas as pd
+import xgboost as xgb
+from xgboost.sklearn import XGBClassifier
+
+
# 1. Load the trained booster into a sklearn-style classifier wrapper.
model = XGBClassifier()
booster = xgb.Booster()
booster.load_model('./data/ad_xgb.model')
model._Booster = booster

# 2. Predict: ad_status = 0 (no ad shown).
# NOTE(review): columns[2:] assumes the first two csv columns are ids, not
# features — confirm against the generator script.
df_0 = pd.read_csv('./data/predict_data/predict_data_0.csv')
columns_0 = df_0.columns.values.tolist()
# BUG FIX: predict_proba returns an (n, 2) array; the original assigned the
# whole array to one column. Keep the positive-class probability.
df_0['y_0'] = model.predict_proba(df_0[columns_0[2:]])[:, 1]

# 3. Predict: ad_status = 1 (ad shown).
df_1 = pd.read_csv('./data/predict_data/predict_data_1.csv')
columns_1 = df_1.columns.values.tolist()
# BUG FIX: the original wrote these scores onto df_0, leaving df_1 without a
# y_1 column and making the merge below carry no ad_status=1 score.
df_1['y_1'] = model.predict_proba(df_1[columns_1[2:]])[:, 1]

# 4. Merge the two scenarios and score the difference (no-ad minus ad).
res_df = pd.merge(df_0, df_1, how='left', on=['apptype', 'mid', 'videoid'])
res_df['res_predict'] = res_df['y_0'] - res_df['y_1']

+ 68 - 0
ad_xgboost_predict_data_generate.py

@@ -0,0 +1,68 @@
+import os
+import pandas as pd
+
+predict_data_dir = './data/predict_data'
+user_filename = 'user_feature.csv'
+video_filename = 'video_feature.csv'
+
+
def read_csv_data(filepath):
    """Load a csv into a DataFrame in 1M-row chunks.

    Returns None (after printing a notice) when the file does not exist;
    chunked reading keeps peak memory bounded for very large files.
    """
    if not os.path.exists(filepath):
        print("Don't have this file!")
        return None
    reader = pd.read_csv(filepath, sep=',', engine='python', iterator=True)
    pieces = []
    while True:
        try:
            pieces.append(reader.get_chunk(1000000))
        except StopIteration:
            break
    return pd.concat(pieces, ignore_index=True)
+
+
+if __name__ == '__main__':
+    # 1. 获取用户特征数据
+    user_filepath = f"{predict_data_dir}/{user_filename}"
+    user_df = read_csv_data(filepath=user_filepath)
+    # 2. 获取视频特征数据
+    video_filepath = f"{predict_data_dir}/{video_filename}"
+    video_df = read_csv_data(filepath=video_filepath)
+    # 3. 用户特征和视频特征进行拼接
+    video_features = [
+        'videoid',
+        'video_preview_count_uv_30day',
+        'video_preview_count_pv_30day',
+        'video_view_count_uv_30day',
+        'video_view_count_pv_30day',
+        'video_play_count_uv_30day',
+        'video_play_count_pv_30day',
+        'video_share_count_uv_30day',
+        'video_share_count_pv_30day',
+        'video_return_count_30day',
+        'video_ctr_uv_30day',
+        'video_ctr_pv_30day',
+        'video_share_rate_uv_30day',
+        'video_share_rate_pv_30day',
+        'video_return_rate_30day',
+    ]
+    merge_df_list = []
+    for ind, row in video_df.iterrows():
+        merge_df_temp = user_df.copy()
+        for feature in video_features:
+            merge_df_temp[feature] = row[feature]
+        merge_df_list.append(merge_df_temp)
+    merge_df = pd.concat(merge_df_list, ignore_index=True)
+    # 4. 拼接广告特征ad_status
+    for ad_status in [0, 1]:
+        res_df = merge_df.copy()
+        res_df['ad_status'] = ad_status
+        # 写入csv
+        predict_data_dir = './data/predict_data'
+        if not os.path.exists(predict_data_dir):
+            os.makedirs(predict_data_dir)
+        res_df.to_csv(f"{predict_data_dir}/predict_data_{ad_status}.csv", index=False)