#coding utf-8 from tqdm import tqdm import sys import json import traceback from threading import Timer from tqdm import tqdm from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu from config import set_config from log import Log config_, _ = set_config() log_ = Log() redis_helper = RedisHelper() from feature import get_item_features from lr_model import LrModel from utils import exe_sql if __name__ == "__main__": project = 'loghubods' datetime = sys.argv[1] sql = """ --odps sql --********************************************************************-- --author:研发 --create time:2023-12-01 15:48:17 --********************************************************************-- with candidate as ( select -- 基础特征_用户 mid AS u_id ,machineinfo_brand AS u_brand ,machineinfo_model AS u_device ,SPLIT(machineinfo_system,' ')[0] AS u_system ,machineinfo_system AS u_system_ver -- 基础特征_视频 ,videoid AS i_id ,i_up_id AS i_up_id ,tags as i_tag ,title as i_title ,ceil(log2(i_title_len + 1)) as i_title_len ,ceil(log2(total_time + 1)) as i_play_len ,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数) -- 基础特征_场景 ,ctx_day AS ctx_day ,ctx_week AS ctx_week ,ctx_hour AS ctx_hour ,ctx_region as ctx_region ,ctx_city as ctx_city -- 基础特征_交叉 ,ui_is_out as ui_is_out ,i_play_len as playtime -- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new ,rootmid AS ui_root_id ,shareid AS ui_share_id -- 统计特征_用户 ,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt ,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt ,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt ,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt ,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt ,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt ,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt ,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt ,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt ,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt ,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt ,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt ,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt ,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt ,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt ,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt ,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day ,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day ,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day ,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day ,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day ,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day ,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day ,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day ,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day ,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day ,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day ,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day ,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month ,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month ,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month ,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month -- 统计特征_视频 ,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt ,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt ,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt ,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt ,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt ,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt ,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt ,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt ,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt ,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt ,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt ,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt ,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt ,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt ,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt ,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt ,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day ,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day ,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day ,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day ,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day ,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day ,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day ,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day ,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day ,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day ,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day ,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day ,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month ,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month ,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month ,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month from user_video_features_data_final where dt='{datetime}' and ad_ornot = '0' and apptype != '13' ), candidate_user as ( SELECT u_id, max(u_1day_exp_cnt) as u_1day_exp_cnt, max(u_1day_click_cnt) as u_1day_click_cnt, max(u_1day_share_cnt) as u_1day_share_cnt, max(u_1day_return_cnt) as u_1day_return_cnt, max(u_3day_exp_cnt) as u_3day_exp_cnt, max(u_3day_click_cnt) as u_3day_click_cnt, max(u_3day_share_cnt) as u_3day_share_cnt, max(u_3day_return_cnt) as u_3day_return_cnt, max(u_7day_exp_cnt) as u_7day_exp_cnt, max(u_7day_click_cnt) as u_7day_click_cnt, max(u_7day_share_cnt) as u_7day_share_cnt, max(u_7day_return_cnt) as u_7day_return_cnt, max(u_3month_exp_cnt) as u_3month_exp_cnt, max(u_3month_click_cnt) as u_3month_click_cnt, max(u_3month_share_cnt) as u_3month_share_cnt, max(u_3month_return_cnt) as u_3month_return_cnt, max(u_ctr_1day) as u_ctr_1day, max(u_str_1day) as u_str_1day, max(u_rov_1day) as u_rov_1day, max(u_ros_1day) as u_ros_1day, max(u_ctr_3day) as u_ctr_3day, max(u_str_3day) as u_str_3day, max(u_rov_3day) as u_rov_3day, max(u_ros_3day) as u_ros_3day, max(u_ctr_7day) as u_ctr_7day, max(u_str_7day) as u_str_7day, max(u_rov_7day) as u_rov_7day, max(u_ros_7day) as u_ros_7day, max(u_ctr_3month) as u_ctr_3month, max(u_str_3month) as u_str_3month, max(u_rov_3month) as u_rov_3month, max(u_ros_3month) as u_ros_3month FROM candidate group by u_id ), candidate_item as ( select i_id, max(i_up_id) as i_up_id, max(i_title_len) as i_title_len, max(i_play_len) as i_play_len, max(i_days_since_upload) as i_days_since_upload, max(i_1day_exp_cnt) as i_1day_exp_cnt, max(i_1day_click_cnt) as i_1day_click_cnt, max(i_1day_share_cnt) as i_1day_share_cnt, max(i_1day_return_cnt) as i_1day_return_cnt, max(i_3day_exp_cnt) as i_3day_exp_cnt, max(i_3day_click_cnt) as i_3day_click_cnt, max(i_3day_share_cnt) as i_3day_share_cnt, max(i_3day_return_cnt) as i_3day_return_cnt, max(i_7day_exp_cnt) as i_7day_exp_cnt, max(i_7day_click_cnt) as i_7day_click_cnt, max(i_7day_share_cnt) as i_7day_share_cnt, max(i_7day_return_cnt) as i_7day_return_cnt, max(i_3month_exp_cnt) as i_3month_exp_cnt, max(i_3month_click_cnt) as i_3month_click_cnt, max(i_3month_share_cnt) as i_3month_share_cnt, max(i_3month_return_cnt) as i_3month_return_cnt, max(i_ctr_1day) as i_ctr_1day, max(i_str_1day) as i_str_1day, max(i_rov_1day) as i_rov_1day, max(i_ros_1day) as i_ros_1day, max(i_ctr_3day) as i_ctr_3day, max(i_str_3day) as i_str_3day, max(i_rov_3day) as i_rov_3day, max(i_ros_3day) as i_ros_3day, max(i_ctr_7day) as i_ctr_7day, max(i_str_7day) as i_str_7day, max(i_rov_7day) as i_rov_7day, max(i_ros_7day) as i_ros_7day, max(i_ctr_3month) as i_ctr_3month, max(i_str_3month) as i_str_3month, max(i_rov_3month) as i_rov_3month, max(i_ros_3month) as i_ros_3month FROM candidate group by i_id ) SELECT * from candidate_item """.format(datetime=datetime) # print(sql) data = exe_sql(project, sql) print('sql done') # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t') # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str) model_key = 'ad_out_v2_model_v1.day' lr_model = LrModel('model/{}.json'.format(model_key)) item_h_dict = {} k_col = 'i_id' dt = datetime key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}" print(key_name_prefix) mean_item_h = 0.0 count_item_h = 0 with data.open_reader() as reader: for row in tqdm(reader): k = str(row['i_id']) item_features = get_item_features(row) item_h = lr_model.predict_h(item_features) redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", item_h, 28 * 3600) item_h_dict[k] = item_h mean_item_h += item_h count_item_h += 1 # print(item_features) # print(item_h) mean_item_h = mean_item_h / count_item_h item_h_dict['mean'] = mean_item_h print(mean_item_h) print(count_item_h) k = 'mean' redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_item_h, 28 * 3600) with open('{}.json'.format(key_name_prefix), 'w') as fout: json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)