123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236 |
- #coding utf-8
- from tqdm import tqdm
- import sys
- import json
- import traceback
- from threading import Timer
- from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
- from config import set_config
- from log import Log
- config_, _ = set_config()
- log_ = Log()
- redis_helper = RedisHelper()
- from feature import get_item_features
- from lr_model import LrModel
- from utils import exe_sql
- if __name__ == "__main__":
- project = 'loghubods'
- datetime = sys.argv[1]
- sql = """
- --odps sql
- --********************************************************************--
- --author:研发
- --create time:2023-12-01 15:48:17
- --********************************************************************--
- with candidate as (
- select
- -- 基础特征_用户
- mid AS u_id
- ,machineinfo_brand AS u_brand
- ,machineinfo_model AS u_device
- ,SPLIT(machineinfo_system,' ')[0] AS u_system
- ,machineinfo_system AS u_system_ver
- -- 基础特征_视频
- ,videoid AS i_id
- ,i_up_id AS i_up_id
- ,tags as i_tag
- ,title as i_title
- ,ceil(log2(i_title_len + 1)) as i_title_len
- ,ceil(log2(total_time + 1)) as i_play_len
- ,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
- -- 基础特征_场景
- ,ctx_day AS ctx_day
- ,ctx_week AS ctx_week
- ,ctx_hour AS ctx_hour
- ,ctx_region as ctx_region
- ,ctx_city as ctx_city
- -- 基础特征_交叉
- ,ui_is_out as ui_is_out
- ,i_play_len as playtime
- -- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
- ,rootmid AS ui_root_id
- ,shareid AS ui_share_id
- -- 统计特征_用户
- ,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
- ,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
- ,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
- ,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
- ,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
- ,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
- ,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
- ,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
- ,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
- ,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
- ,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
- ,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
- ,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
- ,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
- ,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
- ,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
- ,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
- ,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
- ,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
- ,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
- ,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
- ,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
- ,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
- ,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
- ,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
- ,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
- ,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
- ,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
- ,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
- ,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
- ,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
- ,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
- -- 统计特征_视频
- ,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
- ,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
- ,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
- ,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
- ,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
- ,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
- ,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
- ,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
- ,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
- ,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
- ,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
- ,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
- ,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
- ,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
- ,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
- ,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
- ,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
- ,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
- ,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
- ,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
- ,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
- ,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
- ,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
- ,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
- ,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
- ,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
- ,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
- ,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
- ,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
- ,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
- ,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
- ,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
- from
- user_video_features_data_final
- where dt='{datetime}'
- and ad_ornot = '0'
- and apptype != '13'
- ), candidate_user as (
- SELECT
- u_id,
- max(u_1day_exp_cnt) as u_1day_exp_cnt,
- max(u_1day_click_cnt) as u_1day_click_cnt,
- max(u_1day_share_cnt) as u_1day_share_cnt,
- max(u_1day_return_cnt) as u_1day_return_cnt,
- max(u_3day_exp_cnt) as u_3day_exp_cnt,
- max(u_3day_click_cnt) as u_3day_click_cnt,
- max(u_3day_share_cnt) as u_3day_share_cnt,
- max(u_3day_return_cnt) as u_3day_return_cnt,
- max(u_7day_exp_cnt) as u_7day_exp_cnt,
- max(u_7day_click_cnt) as u_7day_click_cnt,
- max(u_7day_share_cnt) as u_7day_share_cnt,
- max(u_7day_return_cnt) as u_7day_return_cnt,
- max(u_3month_exp_cnt) as u_3month_exp_cnt,
- max(u_3month_click_cnt) as u_3month_click_cnt,
- max(u_3month_share_cnt) as u_3month_share_cnt,
- max(u_3month_return_cnt) as u_3month_return_cnt,
- max(u_ctr_1day) as u_ctr_1day,
- max(u_str_1day) as u_str_1day,
- max(u_rov_1day) as u_rov_1day,
- max(u_ros_1day) as u_ros_1day,
- max(u_ctr_3day) as u_ctr_3day,
- max(u_str_3day) as u_str_3day,
- max(u_rov_3day) as u_rov_3day,
- max(u_ros_3day) as u_ros_3day,
- max(u_ctr_7day) as u_ctr_7day,
- max(u_str_7day) as u_str_7day,
- max(u_rov_7day) as u_rov_7day,
- max(u_ros_7day) as u_ros_7day,
- max(u_ctr_3month) as u_ctr_3month,
- max(u_str_3month) as u_str_3month,
- max(u_rov_3month) as u_rov_3month,
- max(u_ros_3month) as u_ros_3month
- FROM
- candidate
- group by u_id
- ), candidate_item as (
- select
- i_id,
- max(i_up_id) as i_up_id,
- max(i_title_len) as i_title_len,
- max(i_play_len) as i_play_len,
- max(i_days_since_upload) as i_days_since_upload,
- max(i_1day_exp_cnt) as i_1day_exp_cnt,
- max(i_1day_click_cnt) as i_1day_click_cnt,
- max(i_1day_share_cnt) as i_1day_share_cnt,
- max(i_1day_return_cnt) as i_1day_return_cnt,
- max(i_3day_exp_cnt) as i_3day_exp_cnt,
- max(i_3day_click_cnt) as i_3day_click_cnt,
- max(i_3day_share_cnt) as i_3day_share_cnt,
- max(i_3day_return_cnt) as i_3day_return_cnt,
- max(i_7day_exp_cnt) as i_7day_exp_cnt,
- max(i_7day_click_cnt) as i_7day_click_cnt,
- max(i_7day_share_cnt) as i_7day_share_cnt,
- max(i_7day_return_cnt) as i_7day_return_cnt,
- max(i_3month_exp_cnt) as i_3month_exp_cnt,
- max(i_3month_click_cnt) as i_3month_click_cnt,
- max(i_3month_share_cnt) as i_3month_share_cnt,
- max(i_3month_return_cnt) as i_3month_return_cnt,
- max(i_ctr_1day) as i_ctr_1day,
- max(i_str_1day) as i_str_1day,
- max(i_rov_1day) as i_rov_1day,
- max(i_ros_1day) as i_ros_1day,
- max(i_ctr_3day) as i_ctr_3day,
- max(i_str_3day) as i_str_3day,
- max(i_rov_3day) as i_rov_3day,
- max(i_ros_3day) as i_ros_3day,
- max(i_ctr_7day) as i_ctr_7day,
- max(i_str_7day) as i_str_7day,
- max(i_rov_7day) as i_rov_7day,
- max(i_ros_7day) as i_ros_7day,
- max(i_ctr_3month) as i_ctr_3month,
- max(i_str_3month) as i_str_3month,
- max(i_rov_3month) as i_rov_3month,
- max(i_ros_3month) as i_ros_3month
- FROM
- candidate
- group by i_id
- )
- SELECT
- *
- from candidate_item
- """.format(datetime=datetime)
- # print(sql)
- data = exe_sql(project, sql)
- print('sql done')
- # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
- # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
- model_key = 'ad_out_v2_model_v1.day'
- lr_model = LrModel('model/{}.json'.format(model_key))
- item_h_dict = {}
- k_col = 'i_id'
- dt = datetime
- key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}:"
- print(key_name_prefix)
- with data.open_reader() as reader:
- for row in reader:
- k = row['i_id']
- item_features = get_item_features(row)
- item_h = lr_model.predict_h(item_features)
- redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", item_h, 28 * 3600)
- item_h_dict[k] = item_h
- # print(item_features)
- # print(item_h)
- with open('{}.json'.format(key_name_prefix), 'w') as fout:
- json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
|