
Merge branch 'master' into feature/zhangbo_flow_recall

zhangbo 1 year ago
commit d03ab1559e

+ 1 - 0
.gitignore

@@ -65,3 +65,4 @@ data/
 *.pickle
 
 .ipynb_checkpoints/
+nohup.out

+ 59 - 16
ad_out_v1_get_offline_score_item.py

@@ -1,8 +1,7 @@
 #coding utf-8
-from tqdm import tqdm
 import sys
 import json
-
+import datetime
 import traceback
 from threading import Timer
 from tqdm import tqdm
@@ -17,10 +16,8 @@ from feature import get_item_features
 from lr_model import LrModel
 from utils import exe_sql
 
-
-if __name__ == "__main__":
+def update_offline_score_item(dt):
     project = 'loghubods'
-    datetime = sys.argv[1]
     sql = """
 --odps sql 
 --********************************************************************--
@@ -127,7 +124,7 @@ mid AS u_id
 ,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
 from
 loghubods.user_video_features_data_final
-where dt='{datetime}'
+where dt='{dt}'
 and ad_ornot = '0'
 and apptype != '13'
 ), candidate_user as (
@@ -223,8 +220,8 @@ and apptype != '13'
 SELECT
 *
 from candidate_item
-    """.format(datetime=datetime)
-    # print(sql)
+    """.format(dt=dt)
+    # log_.info(sql)
     data = exe_sql(project, sql)
     print('sql done')
     # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
@@ -232,8 +229,6 @@ from candidate_item
     model_key = 'ad_out_v1'
     lr_model = LrModel('model/{}.json'.format(model_key))
     item_h_dict = {}
-    k_col = 'i_id'
-    dt = datetime
     key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}"
     print(key_name_prefix)
     mean_item_h = 0.0
@@ -245,18 +240,66 @@ from candidate_item
             k = str(row['i_id'])
             item_features = get_item_features(row)
             item_h = lr_model.predict_h(item_features)
-            redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", item_h, expire_time)
+            #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", item_h, expire_time)
             item_h_dict[k] = item_h
             mean_item_h += item_h
             count_item_h += 1
             # print(item_features)
             # print(item_h)
-    mean_item_h = mean_item_h / count_item_h 
-    item_h_dict['mean'] = mean_item_h 
+    mean_item_h = mean_item_h / count_item_h
+    item_h_dict['mean'] = mean_item_h
     print(mean_item_h)
     print(count_item_h)
     k = 'mean'
-    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_item_h, expire_time)
-    # with open('{}.json'.format(key_name_prefix), 'w') as fout:
-    #     json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+    #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_item_h, expire_time)
+    with open('{}.{}.json'.format(key_name_prefix, dt), 'w') as fout:
+        json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1'].get('project')
+        table = config_.ad_model_data['ad_out_v1'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_item start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_item(dt=yesterday_dt)
+            log_.info('update_offline_score_item end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线item数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线item数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
 

+ 9 - 2
ad_out_v1_get_offline_score_item.sh

@@ -1,3 +1,10 @@
-
-/root/anaconda3/bin/python ad_out_v1_get_offline_score_item.py $*
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /data2/rov-offline &&
+    /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_item_new.py $*
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /data/rov-offline &&
+    /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_item_new.py $*
+fi
 

+ 17 - 0
ad_out_v1_get_offline_score_item_batch.sh

@@ -0,0 +1,17 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+d=2023-12-01
+while [ "$d" != 2023-12-12 ]; do 
+  datetime=$(date -d "$d" +%Y%m%d)
+  echo $datetime
+    if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+        cd /data2/rov-offline &&
+        /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_item_new.py $datetime
+    elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+        cd /data/rov-offline &&
+        /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_item_new.py $datetime
+    fi
+  d=$(date -I -d "$d + 1 day")
+done
+echo "done"
+

+ 291 - 0
ad_out_v1_get_offline_score_item_new.py

@@ -0,0 +1,291 @@
+#coding utf-8
+import sys
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+from records_process import records_process
+
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_item_features as get_features
+from lr_model import LrModel
+from utils import exe_sql
+
+model_key = 'ad_out_v1'
+lr_model = LrModel('model/{}.json'.format(model_key))
+item_h_dict = {}
+key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}"
+print(key_name_prefix)
+# Expiration time: one week
+expire_time = 7 * 24 * 3600
+
+def process_and_store(row):
+    k = str(row['k'])
+    features = get_features(row)
+    h = lr_model.predict_h(features)
+    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", round(h, 6), expire_time)
+
+def update_offline_score_item(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql 
+--********************************************************************--
+--author:研发
+--create time:2023-12-01 15:48:17
+--********************************************************************--
+with candidate as (
+select
+-- 基础特征_用户
+mid AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+-- 基础特征_视频
+,videoid AS i_id
+,i_up_id AS i_up_id
+,tags as i_tag
+,title as i_title
+,ceil(log2(i_title_len + 1)) as i_title_len
+,ceil(log2(total_time + 1)) as i_play_len
+,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
+-- 基础特征_场景
+,apptype AS ctx_apptype
+,ctx_day AS ctx_day
+,ctx_week AS ctx_week
+,ctx_hour AS ctx_hour
+,ctx_region as ctx_region
+,ctx_city as ctx_city
+-- 基础特征_交叉
+,ui_is_out as ui_is_out
+,i_play_len as playtime
+-- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
+,rootmid AS ui_root_id
+,shareid AS ui_share_id
+-- 统计特征_用户
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+-- 统计特征_视频
+,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
+,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
+,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
+,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
+,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
+,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
+,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
+,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
+,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
+,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
+,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
+,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
+,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
+,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
+,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
+,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
+,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
+,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
+,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
+,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
+,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
+,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
+,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
+,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
+,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
+,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
+,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
+,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
+,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
+,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
+,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
+,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
+from
+loghubods.user_video_features_data_final
+where dt='{dt}'
+and ad_ornot = '0'
+and apptype != '13'
+), candidate_user as (
+    SELECT 
+    u_id,
+    max(u_brand) as u_brand,
+    max(u_device) as u_device,
+    max(u_system) as u_system,
+    max(u_system_ver) as u_system_ver,
+    max(ctx_region) as ctx_region,
+    max(ctx_city) as ctx_city,
+    max(u_cycle_bucket_7days) as u_cycle_bucket_7days,
+    max(u_cycle_bucket_30days) as u_cycle_bucket_30days,
+    max(u_share_bucket_30days) as u_share_bucket_30days,
+    max(u_1day_exp_cnt) as u_1day_exp_cnt,
+    max(u_1day_click_cnt) as u_1day_click_cnt,
+    max(u_1day_share_cnt) as u_1day_share_cnt,
+    max(u_1day_return_cnt) as u_1day_return_cnt,
+    max(u_3day_exp_cnt) as u_3day_exp_cnt,
+    max(u_3day_click_cnt) as u_3day_click_cnt,
+    max(u_3day_share_cnt) as u_3day_share_cnt,
+    max(u_3day_return_cnt) as u_3day_return_cnt,
+    max(u_7day_exp_cnt) as u_7day_exp_cnt,
+    max(u_7day_click_cnt) as u_7day_click_cnt,
+    max(u_7day_share_cnt) as u_7day_share_cnt,
+    max(u_7day_return_cnt) as u_7day_return_cnt,
+    max(u_3month_exp_cnt) as u_3month_exp_cnt,
+    max(u_3month_click_cnt) as u_3month_click_cnt,
+    max(u_3month_share_cnt) as u_3month_share_cnt,
+    max(u_3month_return_cnt) as u_3month_return_cnt,
+    max(u_ctr_1day) as u_ctr_1day,
+    max(u_str_1day) as u_str_1day,
+    max(u_rov_1day) as u_rov_1day,
+    max(u_ros_1day) as u_ros_1day,
+    max(u_ctr_3day) as u_ctr_3day,
+    max(u_str_3day) as u_str_3day,
+    max(u_rov_3day) as u_rov_3day,
+    max(u_ros_3day) as u_ros_3day,
+    max(u_ctr_7day) as u_ctr_7day,
+    max(u_str_7day) as u_str_7day,
+    max(u_rov_7day) as u_rov_7day,
+    max(u_ros_7day) as u_ros_7day,
+    max(u_ctr_3month) as u_ctr_3month,
+    max(u_str_3month) as u_str_3month,
+    max(u_rov_3month) as u_rov_3month,
+    max(u_ros_3month) as u_ros_3month
+    FROM 
+    candidate
+    group by u_id
+), candidate_item as (
+    select
+    i_id,
+    max(i_up_id) as i_up_id,
+    max(i_title_len) as i_title_len,
+    max(i_play_len) as i_play_len,
+    max(i_days_since_upload) as i_days_since_upload,
+    max(i_1day_exp_cnt) as i_1day_exp_cnt,
+    max(i_1day_click_cnt) as i_1day_click_cnt,
+    max(i_1day_share_cnt) as i_1day_share_cnt,
+    max(i_1day_return_cnt) as i_1day_return_cnt,
+    max(i_3day_exp_cnt) as i_3day_exp_cnt,
+    max(i_3day_click_cnt) as i_3day_click_cnt,
+    max(i_3day_share_cnt) as i_3day_share_cnt,
+    max(i_3day_return_cnt) as i_3day_return_cnt,
+    max(i_7day_exp_cnt) as i_7day_exp_cnt,
+    max(i_7day_click_cnt) as i_7day_click_cnt,
+    max(i_7day_share_cnt) as i_7day_share_cnt,
+    max(i_7day_return_cnt) as i_7day_return_cnt,
+    max(i_3month_exp_cnt) as i_3month_exp_cnt,
+    max(i_3month_click_cnt) as i_3month_click_cnt,
+    max(i_3month_share_cnt) as i_3month_share_cnt,
+    max(i_3month_return_cnt) as i_3month_return_cnt,
+    max(i_ctr_1day) as i_ctr_1day,
+    max(i_str_1day) as i_str_1day,
+    max(i_rov_1day) as i_rov_1day,
+    max(i_ros_1day) as i_ros_1day,
+    max(i_ctr_3day) as i_ctr_3day,
+    max(i_str_3day) as i_str_3day,
+    max(i_rov_3day) as i_rov_3day,
+    max(i_ros_3day) as i_ros_3day,
+    max(i_ctr_7day) as i_ctr_7day,
+    max(i_str_7day) as i_str_7day,
+    max(i_rov_7day) as i_rov_7day,
+    max(i_ros_7day) as i_ros_7day,
+    max(i_ctr_3month) as i_ctr_3month,
+    max(i_str_3month) as i_str_3month,
+    max(i_rov_3month) as i_rov_3month,
+    max(i_ros_3month) as i_ros_3month
+    FROM 
+    candidate
+    group by i_id
+)
+SELECT
+i_id as k,
+*
+from candidate_item
+    """.format(dt=dt)
+    # log_.info(sql)
+    records = exe_sql(project, sql)
+    log_.info('sql_done')
+    records_process(records, process_and_store, max_size=50, num_workers=10)
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1'].get('project')
+        table = config_.ad_model_data['ad_out_v1'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_item start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_item(dt=yesterday_dt)
+            log_.info('update_offline_score_item end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线item数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线item数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 160 - 0
ad_out_v1_get_offline_score_item_v2.py

@@ -0,0 +1,160 @@
+#coding utf-8
+import sys
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+from records_process import records_process
+
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_item_features as get_features
+from lr_model import LrModel
+from utils import exe_sql
+
+model_key = 'ad_out_v1'
+lr_model = LrModel('model/{}.json'.format(model_key))
+item_h_dict = {}
+key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}"
+print(key_name_prefix)
+# Expiration time: one week
+expire_time = 7 * 24 * 3600
+
+def process_and_store(row):
+    k = str(row['k'])
+    features = get_features(row)
+    h = lr_model.predict_h(features)
+    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", round(h, 6), expire_time)
+
+def update_offline_score_item(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql 
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_item as (
+select
+-- 基础特征_视频
+videoid AS i_id
+,uid AS i_up_id
+-- ,tags as i_tag
+-- ,title as i_title
+,ceil(log2(length(title) + 1)) as i_title_len
+,ceil(log2(total_time + 1)) as i_play_len
+,ceil(log2(existence_days + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
+-- 基础特征_场景
+-- ,apptype AS ctx_apptype
+-- ,ctx_day AS ctx_day
+-- ,ctx_week AS ctx_week
+-- ,ctx_hour AS ctx_hour
+-- ,ctx_region as ctx_region
+-- ,ctx_city as ctx_city
+-- 基础特征_交叉
+-- ,ui_is_out as ui_is_out
+-- ,i_play_len as playtime
+-- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
+-- ,rootmid AS ui_root_id
+-- ,shareid AS ui_share_id
+-- 统计特征_视频
+,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
+,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
+,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
+,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
+,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
+,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
+,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
+,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
+,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
+,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
+,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
+,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
+,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
+,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
+,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
+,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
+,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
+,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
+,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
+,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
+,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
+,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
+,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
+,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
+,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
+,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
+,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
+,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
+,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
+,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
+,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
+,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
+from
+loghubods.alg_recsys_video_info
+where dt='{dt}'
+and length(videoid) > 0
+)
+SELECT
+i_id as k,
+*
+from candidate_item
+order by rand()
+    """.format(dt=dt)
+    # log_.info(sql)
+    records = exe_sql(project, sql)
+    log_.info('sql_done')
+    records_process(records, process_and_store, max_size=50, num_workers=10)
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_item'].get('project')
+        table = config_.ad_model_data['ad_out_v1_item'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_item start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_item(dt=yesterday_dt)
+            log_.info('update_offline_score_item end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线item数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线item数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 10 - 0
ad_out_v1_get_offline_score_item_v2.sh

@@ -0,0 +1,10 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /data2/rov-offline &&
+    /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_item_v2.py $*
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /data/rov-offline &&
+    /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_item_v2.py $*
+fi
+

+ 174 - 0
ad_out_v1_get_offline_score_item_v2_debug.py

@@ -0,0 +1,174 @@
+#coding utf-8
+import sys
+import json
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_item_features
+from lr_model import LrModel
+from utils import exe_sql
+
+def update_offline_score_item(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_item as (
+select
+-- 基础特征_视频
+videoid AS i_id
+,uid AS i_up_id
+-- ,tags as i_tag
+-- ,title as i_title
+,ceil(log2(length(title) + 1)) as i_title_len
+,ceil(log2(total_time + 1)) as i_play_len
+,ceil(log2(existence_days + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
+-- 基础特征_场景
+-- ,apptype AS ctx_apptype
+-- ,ctx_day AS ctx_day
+-- ,ctx_week AS ctx_week
+-- ,ctx_hour AS ctx_hour
+-- ,ctx_region as ctx_region
+-- ,ctx_city as ctx_city
+-- 基础特征_交叉
+-- ,ui_is_out as ui_is_out
+-- ,i_play_len as playtime
+-- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
+-- ,rootmid AS ui_root_id
+-- ,shareid AS ui_share_id
+-- 统计特征_视频
+,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
+,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
+,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
+,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
+,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
+,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
+,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
+,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
+,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
+,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
+,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
+,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
+,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
+,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
+,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
+,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
+,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
+,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
+,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
+,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
+,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
+,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
+,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
+,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
+,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
+,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
+,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
+,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
+,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
+,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
+,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
+,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
+from
+loghubods.alg_recsys_video_info
+where dt='{dt}'
+and length(videoid) > 0
+)
+SELECT
+i_id as k,
+*
+from candidate_item
+    """.format(dt=dt)
+    # log_.info(sql)
+    data = exe_sql(project, sql)
+    print('sql done')
+    # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
+    # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
+    model_key = 'ad_out_v1'
+    lr_model = LrModel('model/{}.json'.format(model_key))
+    item_h_dict = {}
+    key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_ITEM}{model_key}"
+    print(key_name_prefix)
+    mean_item_h = 0.0
+    count_item_h = 0
+    # Expiration time: one week
+    expire_time = 7 * 24 * 3600
+    with data.open_reader() as reader:
+        for row in tqdm(reader):
+            k = str(row['i_id'])
+            item_features = get_item_features(row)
+            item_h = lr_model.predict_h(item_features)
+            #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", item_h, expire_time)
+            item_h_dict[k] = item_h
+            mean_item_h += item_h
+            count_item_h += 1
+            # print(item_features)
+            # print(item_h)
+    mean_item_h = mean_item_h / count_item_h
+    item_h_dict['mean'] = mean_item_h
+    print(mean_item_h)
+    print(count_item_h)
+    k = 'mean'
+    #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_item_h, expire_time)
+    with open('{}.{}.v2.json'.format(key_name_prefix, dt), 'w') as fout:
+        json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_item'].get('project')
+        table = config_.ad_model_data['ad_out_v1_item'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_item start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_item(dt=yesterday_dt)
+            log_.info('update_offline_score_item end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线item数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线item数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 67 - 163
ad_out_v1_get_offline_score_user.py

@@ -1,8 +1,7 @@
 #coding utf-8
-from tqdm import tqdm
 import sys
 import json
-
+import datetime
 import traceback
 from threading import Timer
 from tqdm import tqdm
@@ -17,45 +16,24 @@ from feature import get_user_features
 from lr_model import LrModel
 from utils import exe_sql
 
-if __name__ == "__main__":
+def update_offline_score_user(dt):
     project = 'loghubods'
-    datetime = sys.argv[1]
     sql = """
---odps sql 
+--odps sql
 --********************************************************************--
 --author:研发
---create time:2023-12-01 15:48:17
+--create time:2023-12-11 23:54:20
 --********************************************************************--
-with candidate as (
+with candidate_user as (
 select
 -- 基础特征_用户
-mid AS u_id
+mids AS u_id
 ,machineinfo_brand AS u_brand
 ,machineinfo_model AS u_device
 ,SPLIT(machineinfo_system,' ')[0] AS u_system
 ,machineinfo_system AS u_system_ver
--- 基础特征_视频
-,videoid AS i_id
-,i_up_id AS i_up_id
-,tags as i_tag
-,title as i_title
-,ceil(log2(i_title_len + 1)) as i_title_len
-,ceil(log2(total_time + 1)) as i_play_len
-,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
--- 基础特征_场景
-,apptype AS ctx_apptype
-,ctx_day AS ctx_day
-,ctx_week AS ctx_week
-,ctx_hour AS ctx_hour
-,ctx_region as ctx_region
-,ctx_city as ctx_city
--- 基础特征_交叉
-,ui_is_out as ui_is_out
-,i_play_len as playtime
--- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
-,rootmid AS ui_root_id
-,shareid AS ui_share_id
--- 统计特征_用户
+,province as ctx_region
+,city as ctx_city
 ,u_cycle_bucket_7days
 ,u_cycle_bucket_30days
 ,u_share_bucket_30days
@@ -91,139 +69,18 @@ mid AS u_id
 ,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
 ,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
 ,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
--- 统计特征_视频
-,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
-,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
-,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
-,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
-,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
-,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
-,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
-,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
-,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
-,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
-,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
-,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
-,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
-,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
-,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
-,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
-,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
-,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
-,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
-,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
-,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
-,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
-,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
-,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
-,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
-,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
-,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
-,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
-,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
-,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
-,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
-,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
 from
-loghubods.user_video_features_data_final
-where dt='{datetime}'
-and ad_ornot = '0'
-and apptype != '13'
-), candidate_user as (
-    SELECT 
-    u_id,
-    max(u_brand) as u_brand,
-    max(u_device) as u_device,
-    max(u_system) as u_system,
-    max(u_system_ver) as u_system_ver,
-    max(ctx_region) as ctx_region,
-    max(ctx_city) as ctx_city,
-    max(u_cycle_bucket_7days) as u_cycle_bucket_7days,
-    max(u_cycle_bucket_30days) as u_cycle_bucket_30days,
-    max(u_share_bucket_30days) as u_share_bucket_30days,
-    max(u_1day_exp_cnt) as u_1day_exp_cnt,
-    max(u_1day_click_cnt) as u_1day_click_cnt,
-    max(u_1day_share_cnt) as u_1day_share_cnt,
-    max(u_1day_return_cnt) as u_1day_return_cnt,
-    max(u_3day_exp_cnt) as u_3day_exp_cnt,
-    max(u_3day_click_cnt) as u_3day_click_cnt,
-    max(u_3day_share_cnt) as u_3day_share_cnt,
-    max(u_3day_return_cnt) as u_3day_return_cnt,
-    max(u_7day_exp_cnt) as u_7day_exp_cnt,
-    max(u_7day_click_cnt) as u_7day_click_cnt,
-    max(u_7day_share_cnt) as u_7day_share_cnt,
-    max(u_7day_return_cnt) as u_7day_return_cnt,
-    max(u_3month_exp_cnt) as u_3month_exp_cnt,
-    max(u_3month_click_cnt) as u_3month_click_cnt,
-    max(u_3month_share_cnt) as u_3month_share_cnt,
-    max(u_3month_return_cnt) as u_3month_return_cnt,
-    max(u_ctr_1day) as u_ctr_1day,
-    max(u_str_1day) as u_str_1day,
-    max(u_rov_1day) as u_rov_1day,
-    max(u_ros_1day) as u_ros_1day,
-    max(u_ctr_3day) as u_ctr_3day,
-    max(u_str_3day) as u_str_3day,
-    max(u_rov_3day) as u_rov_3day,
-    max(u_ros_3day) as u_ros_3day,
-    max(u_ctr_7day) as u_ctr_7day,
-    max(u_str_7day) as u_str_7day,
-    max(u_rov_7day) as u_rov_7day,
-    max(u_ros_7day) as u_ros_7day,
-    max(u_ctr_3month) as u_ctr_3month,
-    max(u_str_3month) as u_str_3month,
-    max(u_rov_3month) as u_rov_3month,
-    max(u_ros_3month) as u_ros_3month
-    FROM 
-    candidate
-    group by u_id
-), candidate_item as (
-    select
-    i_id,
-    max(i_up_id) as i_up_id,
-    max(i_title_len) as i_title_len,
-    max(i_play_len) as i_play_len,
-    max(i_days_since_upload) as i_days_since_upload,
-    max(i_1day_exp_cnt) as i_1day_exp_cnt,
-    max(i_1day_click_cnt) as i_1day_click_cnt,
-    max(i_1day_share_cnt) as i_1day_share_cnt,
-    max(i_1day_return_cnt) as i_1day_return_cnt,
-    max(i_3day_exp_cnt) as i_3day_exp_cnt,
-    max(i_3day_click_cnt) as i_3day_click_cnt,
-    max(i_3day_share_cnt) as i_3day_share_cnt,
-    max(i_3day_return_cnt) as i_3day_return_cnt,
-    max(i_7day_exp_cnt) as i_7day_exp_cnt,
-    max(i_7day_click_cnt) as i_7day_click_cnt,
-    max(i_7day_share_cnt) as i_7day_share_cnt,
-    max(i_7day_return_cnt) as i_7day_return_cnt,
-    max(i_3month_exp_cnt) as i_3month_exp_cnt,
-    max(i_3month_click_cnt) as i_3month_click_cnt,
-    max(i_3month_share_cnt) as i_3month_share_cnt,
-    max(i_3month_return_cnt) as i_3month_return_cnt,
-    max(i_ctr_1day) as i_ctr_1day,
-    max(i_str_1day) as i_str_1day,
-    max(i_rov_1day) as i_rov_1day,
-    max(i_ros_1day) as i_ros_1day,
-    max(i_ctr_3day) as i_ctr_3day,
-    max(i_str_3day) as i_str_3day,
-    max(i_rov_3day) as i_rov_3day,
-    max(i_ros_3day) as i_ros_3day,
-    max(i_ctr_7day) as i_ctr_7day,
-    max(i_str_7day) as i_str_7day,
-    max(i_rov_7day) as i_rov_7day,
-    max(i_ros_7day) as i_ros_7day,
-    max(i_ctr_3month) as i_ctr_3month,
-    max(i_str_3month) as i_str_3month,
-    max(i_rov_3month) as i_rov_3month,
-    max(i_ros_3month) as i_ros_3month
-    FROM 
-    candidate
-    group by i_id
+loghubods.alg_recsys_user_info
+where dt='{dt}'
+and length(mids) > 0
+and (u_3month_share_cnt > 0 or u_7day_click_cnt > 0 or u_3day_exp_cnt > 0)
 )
 SELECT
+u_id as k,
 *
 from candidate_user
-    """.format(datetime=datetime)
-    # print(sql)
+    """.format(dt=dt)
+    # log_.info(sql)
     data = exe_sql(project, sql)
     print('sql done')
     # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
@@ -232,7 +89,6 @@ from candidate_user
     lr_model = LrModel('model/{}.json'.format(model_key))
     user_h_dict = {}
     k_col = 'u_id'
-    dt = datetime
     key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
     print(key_name_prefix)
     mean_user_h = 0.0
@@ -244,7 +100,7 @@ from candidate_user
             k = str(row['u_id'])
             user_features = get_user_features(row)
             user_h = lr_model.predict_h(user_features)
-            redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", user_h, expire_time)
+            # redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", user_h, expire_time)
             user_h_dict[k] = user_h
             mean_user_h += user_h
             count_user_h += 1
@@ -255,7 +111,55 @@ from candidate_user
     print(mean_user_h)
     print(count_user_h)
     k = 'mean'
-    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_user_h, expire_time)
-    # with open('{}.json'.format(key_name_prefix), 'w') as fout:
-    #     json.dump(user_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+    #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_user_h, expire_time)
+    with open('{}.{}.json'.format(key_name_prefix, dt), 'w') as fout:
+        json.dump(user_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1'].get('project')
+        table = config_.ad_model_data['ad_out_v1'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
 

+ 9 - 2
ad_out_v1_get_offline_score_user.sh

@@ -1,3 +1,10 @@
-
-/root/anaconda3/bin/python ad_out_v1_get_offline_score_user.py $*
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /data2/rov-offline &&
+    /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_user_new.py $*
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /data/rov-offline &&
+    /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_user_new.py $*
+fi
 

+ 17 - 0
ad_out_v1_get_offline_score_user_batch.sh

@@ -0,0 +1,17 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+d=2023-12-01
+while [ "$d" != 2023-12-12 ]; do 
+  datetime=$(date -d "$d" +%Y%m%d)
+  echo $datetime
+    if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+        cd /data2/rov-offline &&
+        /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_user_new.py $datetime
+    elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+        cd /data/rov-offline &&
+        /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_user_new.py $datetime
+    fi
+  d=$(date -I -d "$d + 1 day")
+done
+echo "done"
+

+ 291 - 0
ad_out_v1_get_offline_score_user_new.py

@@ -0,0 +1,291 @@
+#coding utf-8
+import sys
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+from records_process import records_process
+
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_user_features as get_features
+from lr_model import LrModel
+from utils import exe_sql
+
+model_key = 'ad_out_v1'
+lr_model = LrModel('model/{}.json'.format(model_key))
+item_h_dict = {}
+key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
+print(key_name_prefix)
+# Expiration time: one week
+expire_time = 7 * 24 * 3600
+
+def process_and_store(row):
+    k = str(row['k'])
+    features = get_features(row)
+    h = lr_model.predict_h(features)
+    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", round(h, 6), expire_time)
+
+def update_offline_score_user(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql 
+--********************************************************************--
+--author:研发
+--create time:2023-12-01 15:48:17
+--********************************************************************--
+with candidate as (
+select
+-- 基础特征_用户
+mid AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+-- 基础特征_视频
+,videoid AS i_id
+,i_up_id AS i_up_id
+,tags as i_tag
+,title as i_title
+,ceil(log2(i_title_len + 1)) as i_title_len
+,ceil(log2(total_time + 1)) as i_play_len
+,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
+-- 基础特征_场景
+,apptype AS ctx_apptype
+,ctx_day AS ctx_day
+,ctx_week AS ctx_week
+,ctx_hour AS ctx_hour
+,ctx_region as ctx_region
+,ctx_city as ctx_city
+-- 基础特征_交叉
+,ui_is_out as ui_is_out
+,i_play_len as playtime
+-- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
+,rootmid AS ui_root_id
+,shareid AS ui_share_id
+-- 统计特征_用户
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+-- 统计特征_视频
+,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
+,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
+,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
+,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
+,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
+,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
+,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
+,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
+,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
+,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
+,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
+,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
+,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
+,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
+,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
+,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
+,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
+,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
+,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
+,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
+,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
+,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
+,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
+,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
+,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
+,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
+,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
+,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
+,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
+,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
+,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
+,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
+from
+loghubods.user_video_features_data_final
+where dt='{dt}'
+and ad_ornot = '0'
+and apptype != '13'
+), candidate_user as (
+    SELECT 
+    u_id,
+    max(u_brand) as u_brand,
+    max(u_device) as u_device,
+    max(u_system) as u_system,
+    max(u_system_ver) as u_system_ver,
+    max(ctx_region) as ctx_region,
+    max(ctx_city) as ctx_city,
+    max(u_cycle_bucket_7days) as u_cycle_bucket_7days,
+    max(u_cycle_bucket_30days) as u_cycle_bucket_30days,
+    max(u_share_bucket_30days) as u_share_bucket_30days,
+    max(u_1day_exp_cnt) as u_1day_exp_cnt,
+    max(u_1day_click_cnt) as u_1day_click_cnt,
+    max(u_1day_share_cnt) as u_1day_share_cnt,
+    max(u_1day_return_cnt) as u_1day_return_cnt,
+    max(u_3day_exp_cnt) as u_3day_exp_cnt,
+    max(u_3day_click_cnt) as u_3day_click_cnt,
+    max(u_3day_share_cnt) as u_3day_share_cnt,
+    max(u_3day_return_cnt) as u_3day_return_cnt,
+    max(u_7day_exp_cnt) as u_7day_exp_cnt,
+    max(u_7day_click_cnt) as u_7day_click_cnt,
+    max(u_7day_share_cnt) as u_7day_share_cnt,
+    max(u_7day_return_cnt) as u_7day_return_cnt,
+    max(u_3month_exp_cnt) as u_3month_exp_cnt,
+    max(u_3month_click_cnt) as u_3month_click_cnt,
+    max(u_3month_share_cnt) as u_3month_share_cnt,
+    max(u_3month_return_cnt) as u_3month_return_cnt,
+    max(u_ctr_1day) as u_ctr_1day,
+    max(u_str_1day) as u_str_1day,
+    max(u_rov_1day) as u_rov_1day,
+    max(u_ros_1day) as u_ros_1day,
+    max(u_ctr_3day) as u_ctr_3day,
+    max(u_str_3day) as u_str_3day,
+    max(u_rov_3day) as u_rov_3day,
+    max(u_ros_3day) as u_ros_3day,
+    max(u_ctr_7day) as u_ctr_7day,
+    max(u_str_7day) as u_str_7day,
+    max(u_rov_7day) as u_rov_7day,
+    max(u_ros_7day) as u_ros_7day,
+    max(u_ctr_3month) as u_ctr_3month,
+    max(u_str_3month) as u_str_3month,
+    max(u_rov_3month) as u_rov_3month,
+    max(u_ros_3month) as u_ros_3month
+    FROM 
+    candidate
+    group by u_id
+), candidate_item as (
+    select
+    i_id,
+    max(i_up_id) as i_up_id,
+    max(i_title_len) as i_title_len,
+    max(i_play_len) as i_play_len,
+    max(i_days_since_upload) as i_days_since_upload,
+    max(i_1day_exp_cnt) as i_1day_exp_cnt,
+    max(i_1day_click_cnt) as i_1day_click_cnt,
+    max(i_1day_share_cnt) as i_1day_share_cnt,
+    max(i_1day_return_cnt) as i_1day_return_cnt,
+    max(i_3day_exp_cnt) as i_3day_exp_cnt,
+    max(i_3day_click_cnt) as i_3day_click_cnt,
+    max(i_3day_share_cnt) as i_3day_share_cnt,
+    max(i_3day_return_cnt) as i_3day_return_cnt,
+    max(i_7day_exp_cnt) as i_7day_exp_cnt,
+    max(i_7day_click_cnt) as i_7day_click_cnt,
+    max(i_7day_share_cnt) as i_7day_share_cnt,
+    max(i_7day_return_cnt) as i_7day_return_cnt,
+    max(i_3month_exp_cnt) as i_3month_exp_cnt,
+    max(i_3month_click_cnt) as i_3month_click_cnt,
+    max(i_3month_share_cnt) as i_3month_share_cnt,
+    max(i_3month_return_cnt) as i_3month_return_cnt,
+    max(i_ctr_1day) as i_ctr_1day,
+    max(i_str_1day) as i_str_1day,
+    max(i_rov_1day) as i_rov_1day,
+    max(i_ros_1day) as i_ros_1day,
+    max(i_ctr_3day) as i_ctr_3day,
+    max(i_str_3day) as i_str_3day,
+    max(i_rov_3day) as i_rov_3day,
+    max(i_ros_3day) as i_ros_3day,
+    max(i_ctr_7day) as i_ctr_7day,
+    max(i_str_7day) as i_str_7day,
+    max(i_rov_7day) as i_rov_7day,
+    max(i_ros_7day) as i_ros_7day,
+    max(i_ctr_3month) as i_ctr_3month,
+    max(i_str_3month) as i_str_3month,
+    max(i_rov_3month) as i_rov_3month,
+    max(i_ros_3month) as i_ros_3month
+    FROM 
+    candidate
+    group by i_id
+)
+SELECT
+u_id as k,
+*
+from candidate_user
+    """.format(dt=dt)
+    # log_.info(sql)
+    records = exe_sql(project, sql)
+    log_.info('sql_done')
+    records_process(records, process_and_store, max_size=50, num_workers=10)
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1'].get('project')
+        table = config_.ad_model_data['ad_out_v1'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # Check whether the data for this update is ready
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # Data is ready; run the update
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # Data not ready; check again in 5 minutes
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 149 - 0
ad_out_v1_get_offline_score_user_v2.py

@@ -0,0 +1,149 @@
+#coding utf-8
+import sys
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+from records_process import records_process
+
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_user_features as get_features
+from lr_model import LrModel
+from utils import exe_sql
+
+model_key = 'ad_out_v1'
+lr_model = LrModel('model/{}.json'.format(model_key))
+item_h_dict = {}
+key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
+print(key_name_prefix)
+# 过期时间:一周
+expire_time = 7 * 24 * 3600
+
+def process_and_store(row):
+    k = str(row['k'])
+    features = get_features(row)
+    h = lr_model.predict_h(features)
+    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", round(h, 6), expire_time)
+
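+# process_and_store is the per-row callback handed to records_process below: each worker thread
+# scores one ODPS record with the LR model and writes the rounded score to Redis under
+# "<key_name_prefix>:<u_id>" with a one-week TTL. This assumes LrModel.predict_h and RedisHelper
+# are safe to call concurrently from the threads records_process starts.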
+def update_offline_score_user(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql 
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_user as (
+select
+-- 基础特征_用户
+mids AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+,province as ctx_region
+,city as ctx_city
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+from
+loghubods.alg_recsys_user_info
+where dt='{dt}'
+and length(mids) > 0
+and (u_3month_share_cnt > 0 or u_7day_click_cnt > 0 or u_3day_exp_cnt > 0)
+)
+SELECT
+u_id as k,
+*
+from candidate_user
+    """.format(dt=dt)
+    # log_.info(sql)
+    records = exe_sql(project, sql)
+    log_.info('sql_done')
+    records_process(records, process_and_store, max_size=50, num_workers=10)
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_user'].get('project')
+        table = config_.ad_model_data['ad_out_v1_user'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # 查看当前更新的数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # 数据准备好,进行更新
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # 数据没准备好,5分钟后重新检查
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
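+# Usage note: an explicit partition can be scored manually, e.g.
+# `python ad_out_v1_get_offline_score_user_v2.py 20231211` (the date value here is illustrative);
+# without an argument the script targets yesterday's partition and retries until it is ready.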
+

+ 10 - 0
ad_out_v1_get_offline_score_user_v2.sh

@@ -0,0 +1,10 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /data2/rov-offline &&
+    /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_user_v2.py "$@"
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /data/rov-offline &&
+    /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_user_v2.py "$@"
+fi
+

+ 165 - 0
ad_out_v1_get_offline_score_user_v2_debug.py

@@ -0,0 +1,165 @@
+# coding: utf-8
+import sys
+import json
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_user_features
+from lr_model import LrModel
+from utils import exe_sql
+
+def update_offline_score_user(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_user as (
+select
+-- 基础特征_用户
+mids AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+,province as ctx_region
+,city as ctx_city
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+from
+loghubods.alg_recsys_user_info
+where dt='{dt}'
+and length(mids) > 0
+and (u_3month_share_cnt > 0 or u_7day_click_cnt > 0 or u_3day_exp_cnt > 0)
+)
+SELECT
+u_id as k,
+*
+from candidate_user
+    """.format(dt=dt)
+    # log_.info(sql)
+    data = exe_sql(project, sql)
+    print('sql done')
+    # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
+    # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
+    model_key = 'ad_out_v1'
+    lr_model = LrModel('model/{}.json'.format(model_key))
+    user_h_dict = {}
+    k_col = 'u_id'
+    key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
+    print(key_name_prefix)
+    mean_user_h = 0.0
+    count_user_h = 0
+    # 过期时间:一周
+    expire_time = 7 * 24 * 3600
+    with data.open_reader() as reader:
+        for row in tqdm(reader):
+            k = str(row['u_id'])
+            user_features = get_user_features(row)
+            user_h = lr_model.predict_h(user_features)
+            # redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", user_h, expire_time)
+            user_h_dict[k] = user_h
+            mean_user_h += user_h
+            count_user_h += 1
+            # print(user_features)
+            # print(user_h)
+    mean_user_h = mean_user_h / count_user_h 
+    user_h_dict['mean'] = mean_user_h 
+    print(mean_user_h)
+    print(count_user_h)
+    k = 'mean'
+    #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_user_h, expire_time)
+    with open('{}.{}.v2.json'.format(key_name_prefix, dt), 'w') as fout:
+        json.dump(user_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+
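+# Debug variant of ad_out_v1_get_offline_score_user_v2.py: the Redis writes are commented out and
+# the per-user scores plus their mean are dumped to a local JSON file instead, presumably for
+# inspecting the score distribution (e.g. when choosing the thresholds written by ad_out_v1_set_config.py).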
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_user'].get('project')
+        table = config_.ad_model_data['ad_out_v1_user'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # 查看当前更新的数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # 数据准备好,进行更新
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # 数据没准备好,5分钟后重新检查
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 150 - 0
ad_out_v1_get_offline_score_user_v3.py

@@ -0,0 +1,150 @@
+# coding: utf-8
+import sys
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+from records_process import records_process
+
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_user_features as get_features
+from lr_model import LrModel
+from utils import exe_sql
+
+model_key = 'ad_out_v1'
+lr_model = LrModel('model/{}.json'.format(model_key))
+item_h_dict = {}
+key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
+print(key_name_prefix)
+# 过期时间:一周
+expire_time = 7 * 24 * 3600
+
+def process_and_store(row):
+    k = str(row['k'])
+    features = get_features(row)
+    h = lr_model.predict_h(features)
+    redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", round(h, 6), expire_time)
+
+def update_offline_score_user(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql 
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_user as (
+select
+-- 基础特征_用户
+mids AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+,province as ctx_region
+,city as ctx_city
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+from
+loghubods.alg_recsys_user_info
+where dt='{dt}'
+and length(mids) > 0
+and u_1month_exp_cnt > 0
+)
+SELECT
+u_id as k,
+*
+from candidate_user
+order by rand()
+    """.format(dt=dt)
+    # log_.info(sql)
+    records = exe_sql(project, sql)
+    log_.info('sql_done')
+    records_process(records, process_and_store, max_size=50, num_workers=10)
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_user'].get('project')
+        table = config_.ad_model_data['ad_out_v1_user'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # 查看当前更新的数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # 数据准备好,进行更新
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # 数据没准备好,5分钟后重新检查
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 10 - 0
ad_out_v1_get_offline_score_user_v3.sh

@@ -0,0 +1,10 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /data2/rov-offline &&
+    /root/anaconda3/bin/python /data2/rov-offline/ad_out_v1_get_offline_score_user_v3.py "$@"
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /data/rov-offline &&
+    /root/anaconda3/bin/python /data/rov-offline/ad_out_v1_get_offline_score_user_v3.py "$@"
+fi
+

+ 165 - 0
ad_out_v1_get_offline_score_user_v3_debug.py

@@ -0,0 +1,165 @@
+# coding: utf-8
+import sys
+import json
+import datetime
+import traceback
+from threading import Timer
+from tqdm import tqdm
+from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
+from config import set_config
+from log import Log
+config_, _ = set_config()
+log_ = Log()
+redis_helper = RedisHelper()
+
+from feature import get_user_features
+from lr_model import LrModel
+from utils import exe_sql
+
+def update_offline_score_user(dt):
+    project = 'loghubods'
+    sql = """
+--odps sql
+--********************************************************************--
+--author:研发
+--create time:2023-12-11 23:54:20
+--********************************************************************--
+with candidate_user as (
+select
+-- 基础特征_用户
+mids AS u_id
+,machineinfo_brand AS u_brand
+,machineinfo_model AS u_device
+,SPLIT(machineinfo_system,' ')[0] AS u_system
+,machineinfo_system AS u_system_ver
+,province as ctx_region
+,city as ctx_city
+,u_cycle_bucket_7days
+,u_cycle_bucket_30days
+,u_share_bucket_30days
+,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
+,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
+,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
+,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
+,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
+,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
+,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
+,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
+,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
+,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
+,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
+,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
+,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
+,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
+,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
+,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
+,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
+,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
+,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
+,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
+,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
+,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
+,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
+,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
+,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
+,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
+,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
+,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
+,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
+,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
+,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
+,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
+from
+loghubods.alg_recsys_user_info
+where dt='{dt}'
+and length(mids) > 0
+and u_1month_exp_cnt > 0
+)
+SELECT
+u_id as k,
+*
+from candidate_user
+    """.format(dt=dt)
+    # log_.info(sql)
+    data = exe_sql(project, sql)
+    print('sql done')
+    # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
+    # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
+    model_key = 'ad_out_v1'
+    lr_model = LrModel('model/{}.json'.format(model_key))
+    user_h_dict = {}
+    k_col = 'u_id'
+    key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
+    print(key_name_prefix)
+    mean_user_h = 0.0
+    count_user_h = 0
+    # 过期时间:一周
+    expire_time = 7 * 24 * 3600
+    with data.open_reader() as reader:
+        for row in tqdm(reader):
+            k = str(row['u_id'])
+            user_features = get_user_features(row)
+            user_h = lr_model.predict_h(user_features)
+            # redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", user_h, expire_time)
+            user_h_dict[k] = user_h
+            mean_user_h += user_h
+            count_user_h += 1
+            # print(user_features)
+            # print(user_h)
+    mean_user_h = mean_user_h / count_user_h 
+    user_h_dict['mean'] = mean_user_h 
+    print(mean_user_h)
+    print(count_user_h)
+    k = 'mean'
+    #redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_user_h, expire_time)
+    with open('{}.{}.v3.json'.format(key_name_prefix, dt), 'w') as fout:
+        json.dump(user_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)
+
+
+def timer_check(dt):
+    try:
+        project = config_.ad_model_data['ad_out_v1_user'].get('project')
+        table = config_.ad_model_data['ad_out_v1_user'].get('table')
+        now_date = datetime.datetime.today()
+        yesterday_date = now_date - datetime.timedelta(days=1)
+        now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
+        yesterday_dt = datetime.datetime.strftime(yesterday_date, '%Y%m%d')
+        log_.info(f"now_dt: {now_dt}")
+        if dt is not None:
+            yesterday_dt = dt
+        log_.info(f"update_dt: {yesterday_dt}")
+        now_min = datetime.datetime.now().minute
+        # 查看当前更新的数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=yesterday_dt)
+        if data_count > 0:
+            log_.info('update_offline_score_user start! {}'.format(data_count))
+            # 数据准备好,进行更新
+            update_offline_score_user(dt=yesterday_dt)
+            log_.info('update_offline_score_user end!')
+        else:
+            # 数据没准备好,5分钟后重新检查
+            wait_seconds = 5 * 60
+            log_.info('data not ready, wait {}s'.format(wait_seconds))
+            Timer(wait_seconds, timer_check, args=(dt,)).start()
+
+    except Exception as e:
+        log_.error(f"用户广告跳出率预估离线user数据更新失败 exception: {e}, traceback: {traceback.format_exc()}")
+        send_msg_to_feishu(
+            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
+            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
+            msg_text=f"rov-offline{config_.ENV_TEXT} - 用户广告跳出率预估离线user数据更新失败\n"
+                     f"exception: {e}\n"
+                     f"traceback: {traceback.format_exc()}"
+        )
+
+
+if __name__ == "__main__":
+    dt = None
+    if len(sys.argv) > 1:
+        dt = sys.argv[1]
+        log_.info('## 手动更新:{}'.format(dt))
+    else:
+        log_.info('## 自动更新')
+    timer_check(dt)
+
+

+ 27 - 15
ad_out_v1_set_config.py

@@ -11,20 +11,32 @@ redis_helper = RedisHelper()
 
 if __name__ == '__main__':
     model_key = 'ad_out_v1'
-    abtest_id = '173'
-    abtest_config_tag_list = ['u', 'v']
-    for abtest_config_tag in abtest_config_tag_list :
+    configs = {
+        '173-u': {
+            'threshold': 0.26,
+            'item_threshold': 0.3957,
+            'user_threshold': 0.3272,
+            'miss_threshold': 0.0,
+            'use_backup': 'true'
+        },
+        '173-v': {
+            'threshold': 0.273169,
+            'item_threshold': 0.402544,
+            'user_threshold': 0.334571,
+            'miss_threshold': 0.0,
+            'use_backup': 'false'
+        },
+    }
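+    # Each "<abtest_id>-<tag>" entry above is flattened into per-parameter Redis keys of the form
+    # f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_CONFIG}{model_key}:{abtest_id}:{tag}:{param}" with a
+    # 30-day TTL; the previous value is printed before being overwritten.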
+    # abtest_id = '173'
+    # abtest_config_tag_list = ['u', 'v']
+    expire_time = 30 * 24 * 3600
+    for abtest_key, abtest_config in configs.items():
+        print(abtest_key)
+        abtest_id, abtest_config_tag = abtest_key.split('-')
         config_key_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_CONFIG}{model_key}:{abtest_id}:{abtest_config_tag}"
-        threshold_key = f"{config_key_prefix}:threshold"
-        use_mean_key = f"{config_key_prefix}:use_mean"
-        print(threshold_key)
-        threshold = redis_helper.get_data_from_redis(key_name=threshold_key)
-        use_mean = redis_helper.get_data_from_redis(key_name=use_mean_key)
-        print('threshold:', threshold, type(threshold))
-        print('use_mean:', use_mean, type(use_mean))
-        expire_time = 30 * 24 * 3600
-        redis_helper.set_data_to_redis(threshold_key, 0.27, expire_time)
-        redis_helper.set_data_to_redis(use_mean_key, 'false', expire_time)
-        print(threshold_key)
-        print(use_mean_key)
+        for k, v in abtest_config.items():
+            k_key = f"{config_key_prefix}:{k}"
+            k_value = redis_helper.get_data_from_redis(key_name=k_key)
+            print(k_key, k_value, type(k_value))
+            redis_helper.set_data_to_redis(k_key, v, expire_time)
 

+ 2 - 2
config.py

@@ -208,8 +208,8 @@ class BaseConfig(object):
             APP_TYPE['VLOG']: 0.3,
             APP_TYPE['LOVE_LIVE']: 0.2,
             APP_TYPE['LONG_VIDEO']: 0.2,
-            APP_TYPE['SHORT_VIDEO']: 0.1,
-            # APP_TYPE['WAN_NENG_VIDEO']: 1,
+            APP_TYPE['SHORT_VIDEO']: 0.05,
+            APP_TYPE['WAN_NENG_VIDEO']: 0.05,
             # APP_TYPE['LAO_HAO_KAN_VIDEO']: 1,
             # APP_TYPE['ZUI_JING_QI']: 1,
             APP_TYPE['APP']: 0.05,

+ 21 - 0
recommend_region_data_status_update.py

@@ -0,0 +1,21 @@
+import datetime
+from config import set_config
+from log import Log
+from db_helper import RedisHelper
+
+config_, _ = set_config()
+log_ = Log()
+
+now_date = datetime.datetime.today()
+log_.info(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d%H')}")
+redis_helper = RedisHelper()
+redis_helper.set_data_to_redis(
+    key_name=f"{config_.RULE_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}", value='0', expire_time=2 * 3600
+)
+redis_helper.set_data_to_redis(
+    key_name=f"{config_.REGION_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}", value='0', expire_time=2 * 3600
+)
+redis_helper.set_data_to_redis(
+    key_name=f"{config_.RULE_H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}", value='0', expire_time=2 * 3600
+)
+log_.info(f"recommend data status update to initial '0' finished!")

+ 39 - 0
records_process.py

@@ -0,0 +1,39 @@
+# coding: utf-8
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from queue import Queue
+from tqdm import tqdm
+
+def worker(queue, executor):
+    while True:
+        row = queue.get()
+        if row is None:  # 结束信号
+            queue.task_done()
+            break
+        executor(row)
+        queue.task_done()
+
+def records_process(records, executor, max_size=50, num_workers=10):
+    # 创建一个线程安全的队列
+    queue = Queue(maxsize=max_size)  # 可以调整 maxsize 以控制内存使用
+    # num_workers is the size of the worker-thread pool started below
+    # 启动工作线程
+    threads = []
+    for _ in range(num_workers):
+        t = threading.Thread(target=worker, args=(queue, executor))
+        t.start()
+        threads.append(t)
+    # 读取数据并放入队列
+    with records.open_reader() as reader:
+        for row in tqdm(reader):
+            queue.put(row)
+    # 发送结束信号
+    for _ in range(num_workers):
+        queue.put(None)
+    # 等待所有任务完成
+    queue.join()
+    # 等待所有工作线程结束
+    for t in threads:
+        t.join()
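+# Minimal usage sketch, matching how the ad_out_v1_get_offline_score_user_v2/_v3 scripts call it:
+#
+#     records = exe_sql(project, sql)   # ODPS result supporting .open_reader()
+#     records_process(records, process_and_store, max_size=50, num_workers=10)
+#
+# max_size bounds the queue (and hence memory) when the reader outpaces the workers. Note that if
+# executor raises, that worker thread dies without calling task_done(), so queue.join() can hang;
+# callers are expected to keep the executor exception-free.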
+

+ 111 - 10
region_rule_rank_h.py

@@ -7,6 +7,7 @@
 import multiprocessing
 import os
 import sys
+import time
 import traceback
 
 import gevent
@@ -672,11 +673,14 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank
     by_48h_rule_key = param.get('48h_rule_key', None)
     dup_remove = param.get('dup_remove', True)
     # 与其他召回视频池去重,存入对应的redis
-    dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key, h_rule_key=h_rule_key,
-                 region_24h_rule_key=region_24h_rule_key, by_24h_rule_key=by_24h_rule_key,
-                 by_48h_rule_key=by_48h_rule_key, region=region, data_key=data_key,
-                 rule_rank_h_flag=rule_rank_h_flag, political_filter=political_filter,
-                 shield_config=shield_config, dup_remove=dup_remove)
+    dup_to_redis_with_timecheck(
+        h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key, h_rule_key=h_rule_key,
+        region_24h_rule_key=region_24h_rule_key, by_24h_rule_key=by_24h_rule_key,
+        by_48h_rule_key=by_48h_rule_key, region=region, data_key=data_key,
+        rule_rank_h_flag=rule_rank_h_flag, political_filter=political_filter,
+        shield_config=shield_config, dup_remove=dup_remove
+    )
+    # log_.info(f"==============")
 
 
 def dup_data(h_video_ids, initial_key_name, dup_key_name, region, political_filter, shield_config, dup_remove):
@@ -793,6 +797,101 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, h_rule_key, region_24h_
     #                        dup_key_name=model_data_dup_key_name, region=region)
 
 
+def dup_to_redis_with_timecheck(h_video_ids, now_date, now_h, rule_key, h_rule_key, region_24h_rule_key,
+                                by_24h_rule_key, by_48h_rule_key, region, data_key, rule_rank_h_flag,
+                                political_filter, shield_config, dup_remove):
+    """将地域分组小时级数据与其他召回视频池去重,存入对应的redis"""
+    # 获取并判断其他数据表更新状态
+    redis_helper = RedisHelper()
+    while True:
+        rule_24h_status = redis_helper.get_data_from_redis(key_name=f"{config_.RULE_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}")
+        region_24h_status = redis_helper.get_data_from_redis(key_name=f"{config_.REGION_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}")
+        rule_h_status = redis_helper.get_data_from_redis(key_name=f"{config_.RULE_H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}")
+        if rule_24h_status == '1' and region_24h_status == '1' and rule_h_status == '1':
+            # log_.info("dup data start ....")
+            # ##### 去重更新不区分地域小时级列表,并另存为redis中
+            if h_rule_key is not None:
+                h_key_name = \
+                    f"{config_.RECALL_KEY_NAME_PREFIX_BY_H_H}{data_key}:{h_rule_key}:" \
+                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_dup_key_name = \
+                    f"{config_.RECALL_KEY_NAME_PREFIX_DUP_H_H}{region}:{data_key}:{rule_key}:" \
+                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_key_name,
+                                       dup_key_name=h_dup_key_name, region=region, political_filter=political_filter,
+                                       shield_config=shield_config, dup_remove=dup_remove)
+
+            # ##### 去重更新地域分组小时级24h列表,并另存为redis中
+            region_24h_key_name = \
+                f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_24H}{region}:{data_key}:{region_24h_rule_key}:" \
+                f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+            region_24h_dup_key_name = \
+                f"{config_.RECALL_KEY_NAME_PREFIX_DUP1_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
+                f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+            h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=region_24h_key_name,
+                                   dup_key_name=region_24h_dup_key_name, region=region, political_filter=political_filter,
+                                   shield_config=shield_config, dup_remove=dup_remove)
+
+            if rule_rank_h_flag == '48h':
+
+                # ##### 去重小程序相对48h更新结果,并另存为redis中
+                h_48h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_48H}{data_key}:{by_48h_rule_key}:" \
+                                 f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_48h_dup_key_name = \
+                    f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
+                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_48h_key_name,
+                                       dup_key_name=h_48h_dup_key_name, region=region, political_filter=political_filter,
+                                       shield_config=shield_config, dup_remove=dup_remove)
+
+                # ##### 去重小程序相对48h 筛选后剩余数据 更新结果,并另存为redis中
+                if by_48h_rule_key == 'rule1':
+                    other_h_48h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_48H_OTHER}{data_key}:" \
+                                           f"{by_48h_rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                    other_h_48h_dup_key_name = \
+                        f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
+                        f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                    h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_48h_key_name,
+                                           dup_key_name=other_h_48h_dup_key_name, region=region,
+                                           political_filter=political_filter, shield_config=shield_config,
+                                           dup_remove=dup_remove)
+
+            else:
+                # ##### 去重小程序相对24h更新结果,并另存为redis中
+                h_24h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{data_key}:{by_24h_rule_key}:" \
+                                 f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_24h_dup_key_name = \
+                    f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
+                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_24h_key_name,
+                                       dup_key_name=h_24h_dup_key_name, region=region, political_filter=political_filter,
+                                       shield_config=shield_config, dup_remove=dup_remove)
+
+                # ##### 去重小程序相对24h 筛选后剩余数据 更新结果,并另存为redis中
+                # if by_24h_rule_key in ['rule3', 'rule4']:
+                other_h_24h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H_OTHER}{data_key}:" \
+                                       f"{by_24h_rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                other_h_24h_dup_key_name = \
+                    f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
+                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+                h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_24h_key_name,
+                                       dup_key_name=other_h_24h_dup_key_name, region=region,
+                                       political_filter=political_filter,
+                                       shield_config=shield_config, dup_remove=dup_remove)
+            break
+        else:
+            # 数据没准备好,1分钟后重新检查
+            # log_.info("dup data wait ....")
+            time.sleep(60)
+            # Timer(
+            #     60,
+            #     dup_to_redis_with_timecheck,
+            #     args=[h_video_ids, now_date, now_h, rule_key, h_rule_key, region_24h_rule_key,
+            #           by_24h_rule_key, by_48h_rule_key, region, data_key, rule_rank_h_flag,
+            #           political_filter, shield_config, dup_remove]
+            # ).start()
+
+
 def merge_df(df_left, df_right):
     """
     df按照videoid, code 合并,对应特征求和
@@ -1118,11 +1217,13 @@ def h_bottom_process(param, rule_params_item, region_code_list, key_prefix, redi
         if len(final_data) > 0:
             redis_helper.add_data_with_zset(key_name=final_key_name, data=final_data, expire_time=2 * 24 * 3600)
         # 与其他召回视频池去重,存入对应的redis
-        dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key, h_rule_key=h_rule_key,
-                     region_24h_rule_key=region_24h_rule_key, region=region,
-                     data_key=data_key, by_24h_rule_key=by_24h_rule_key,
-                     by_48h_rule_key=by_48h_rule_key, rule_rank_h_flag=rule_rank_h_flag,
-                     political_filter=political_filter, shield_config=shield_config, dup_remove=dup_remove)
+        dup_to_redis_with_timecheck(
+            h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key, h_rule_key=h_rule_key,
+            region_24h_rule_key=region_24h_rule_key, region=region,
+            data_key=data_key, by_24h_rule_key=by_24h_rule_key,
+            by_48h_rule_key=by_48h_rule_key, rule_rank_h_flag=rule_rank_h_flag,
+            political_filter=political_filter, shield_config=shield_config, dup_remove=dup_remove
+        )
     # 特殊城市视频数据准备
     for region, city_list in config_.REGION_CITY_MAPPING.items():
         t = [

+ 13 - 4
region_rule_rank_h_by24h.py

@@ -167,7 +167,7 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key):
     h_recall_df['videoid'] = h_recall_df['videoid'].astype(int)
     h_recall_videos = h_recall_df['videoid'].to_list()
     log_.info(f'h_recall_videos count = {len(h_recall_videos)}')
-    log_.info('h_recall_videos:{}'.format('-'.join([str(i) for i in h_recall_videos])))
+    # log_.info('h_recall_videos:{}'.format('-'.join([str(i) for i in h_recall_videos])))
 
     # 视频状态过滤
     if data_key in ['data7', ]:
@@ -187,9 +187,9 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key):
     day_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_24H}{region}:{data_key}:{rule_key}:" \
         f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
-    log_.info("day_recall_result.type:{}".format(str(type(day_recall_result))))
-    log_.info("begin to write redis for day_recall_key_name:{} with {}".format(day_recall_key_name,
-                                                                               str(len(day_recall_result))))
+    # log_.info("day_recall_result.type:{}".format(str(type(day_recall_result))))
+    # log_.info("begin to write redis for day_recall_key_name:{} with {}".format(day_recall_key_name,
+    #                                                                            str(len(day_recall_result))))
     if len(day_recall_result) > 0:
         redis_helper.add_data_with_zset(key_name=day_recall_key_name, data=day_recall_result, expire_time=2 * 3600)
         # 清空线上过滤应用列表
@@ -487,6 +487,7 @@ def h_timer_check():
         now_h = datetime.datetime.now().hour
         now_min = datetime.datetime.now().minute
         log_.info(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d%H')}")
+        redis_helper = RedisHelper()
         # 查看当天更新的数据是否已准备好
         h_data_count = data_check(project=project, table=table, now_date=now_date)
         if h_data_count > 0:
@@ -495,10 +496,18 @@ def h_timer_check():
             rank_by_24h(now_date=now_date, now_h=now_h, rule_params=rule_params,
                         project=project, table=table, region_code_list=region_code_list)
             log_.info(f"region_24h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.REGION_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"region_24h_data status update to '1' finished!")
         elif now_min > 40:
             log_.info('24h_recall data is None, use bottom data!')
             h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params, region_code_list=region_code_list)
             log_.info(f"region_24h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.REGION_24H_DATA_STATUS}:{datetime.datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"region_24h_data status update to '1' finished!")
         else:
             # 数据没准备好,1分钟后重新检查
             Timer(60, h_timer_check).start()

+ 24 - 8
region_rule_rank_h_task.sh

@@ -1,17 +1,33 @@
 source /etc/profile
 echo $ROV_OFFLINE_ENV
 if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
-    cd /data2/rov-offline && /root/anaconda3/bin/python /data2/rov-offline/rule_rank_h_by_24h.py &&
-     /root/anaconda3/bin/python /data2/rov-offline/region_rule_rank_h_by24h.py &&
-     /root/anaconda3/bin/python /data2/rov-offline/rule_rank_h_new.py &&
-      /root/anaconda3/bin/python /data2/rov-offline/region_rule_rank_h.py '24h'
+    cd /data2/rov-offline
+    /root/anaconda3/bin/python /data2/rov-offline/recommend_region_data_status_update.py
+    echo "recommend data status update to initial '0' finished!"
+    nohup /root/anaconda3/bin/python /data2/rov-offline/rule_rank_h_by_24h.py &
+    nohup /root/anaconda3/bin/python /data2/rov-offline/region_rule_rank_h_by24h.py &
+    nohup /root/anaconda3/bin/python /data2/rov-offline/rule_rank_h_new.py &
+#    while ps aux | grep "rule_rank_h_by_24h.py" | grep -v grep > /dev/null || ps aux | grep "region_rule_rank_h_by24h.py" | grep -v grep > /dev/null || ps aux | grep "rule_rank_h_new.py" | grep -v grep > /dev/null;
+#    do
+#      sleep 30
+#    done
+#    echo "24h, region_24h, h data update task finished!"
+    nohup /root/anaconda3/bin/python /data2/rov-offline/region_rule_rank_h.py '24h' &
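+    # region_rule_rank_h.py can start alongside the three jobs above: its dedup step
+    # (dup_to_redis_with_timecheck) blocks on the *_DATA_STATUS flags those jobs set in Redis.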
 #      /root/anaconda3/bin/python /data2/rov-offline/region_rule_rank_h_new.py
 #      /root/anaconda3/bin/python /data2/rov-offline/laohaokan_recommend_update.py
 elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
-    cd /data/rov-offline && /root/anaconda3/bin/python /data/rov-offline/rule_rank_h_by_24h.py &&
-     /root/anaconda3/bin/python /data/rov-offline/region_rule_rank_h_by24h.py &&
-     /root/anaconda3/bin/python /data/rov-offline/rule_rank_h_new.py &&
-      /root/anaconda3/bin/python /data/rov-offline/region_rule_rank_h.py '24h'
+    cd /data/rov-offline
+    /root/anaconda3/bin/python /data/rov-offline/recommend_region_data_status_update.py
+    echo "recommend data status update to initial '0' finished!"
+    nohup /root/anaconda3/bin/python /data/rov-offline/rule_rank_h_by_24h.py &
+    nohup /root/anaconda3/bin/python /data/rov-offline/region_rule_rank_h_by24h.py &
+    nohup /root/anaconda3/bin/python /data/rov-offline/rule_rank_h_new.py &
+#    while ps aux | grep "rule_rank_h_by_24h.py" | grep -v grep > /dev/null || ps aux | grep "region_rule_rank_h_by24h.py" | grep -v grep > /dev/null || ps aux | grep "rule_rank_h_new.py" | grep -v grep > /dev/null;
+#    do
+#      sleep 30
+#    done
+#    echo "24h, region_24h, h data update task finished!"
+    nohup /root/anaconda3/bin/python /data/rov-offline/region_rule_rank_h.py '24h' &
 #      /root/anaconda3/bin/python /data/rov-offline/region_rule_rank_h_new.py
 #      /root/anaconda3/bin/python /data/rov-offline/laohaokan_recommend_update.py
 fi

+ 100 - 37
rule_rank_h_by_24h.py

@@ -1,4 +1,7 @@
+import time
+
 import pandas as pd
+import multiprocessing
 import math
 import traceback
 from functools import reduce
@@ -196,7 +199,7 @@ def video_rank_h(df, now_date, now_h, rule_key, param, data_key, notify_backend)
         filtered_videos = filter_video_status_app(day_recall_videos)
     else:
         filtered_videos = filter_video_status(day_recall_videos)
-    log_.info('filtered_videos count = {}'.format(len(filtered_videos)))
+    # log_.info('filtered_videos count = {}'.format(len(filtered_videos)))
 
     # 写入对应的redis
     now_dt = datetime.strftime(now_date, '%Y%m%d')
@@ -211,7 +214,7 @@ def video_rank_h(df, now_date, now_h, rule_key, param, data_key, notify_backend)
 
     h_24h_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{data_key}:{rule_key}:{now_dt}:{now_h}"
-    log_.info("h_24h_recall_key_name:redis:{}".format(h_24h_recall_key_name))
+    # log_.info("h_24h_recall_key_name:redis:{}".format(h_24h_recall_key_name))
     if len(day_recall_result) > 0:
         log_.info(f"count = {len(day_recall_result)}, key = {h_24h_recall_key_name}")
         redis_helper.add_data_with_zset(key_name=h_24h_recall_key_name, data=day_recall_result, expire_time=2 * 3600)
@@ -222,11 +225,12 @@ def video_rank_h(df, now_date, now_h, rule_key, param, data_key, notify_backend)
     all_videos = df['videoid'].to_list()
     log_.info(f'h_by24h_recall all videos count = {len(all_videos)}')
     # 视频状态过滤
+    st_time = time.time()
     if data_key in ['data7', ]:
         all_filtered_videos = filter_video_status_app(all_videos)
     else:
         all_filtered_videos = filter_video_status(all_videos)
-    log_.info(f'all_filtered_videos count = {len(all_filtered_videos)}')
+    log_.info(f'all_filtered_videos count = {len(all_filtered_videos)}, param = {param}, execute_time = {int((time.time() - st_time) * 1000)}ms')
     # 与筛选结果去重
     other_videos = [video for video in all_filtered_videos if video not in day_video_ids]
     log_.info(f'other_videos count = {len(other_videos)}')
@@ -303,6 +307,43 @@ def merge_df_with_score(df_left, df_right):
     return df_merged[feature_list]
 
 
+def process_with_param(param, data_params_item, rule_params_item, feature_df, now_date, now_h):
+    log_.info(f"param = {param} start...")
+    score_df_list = []
+    notify_backend = param.get('notify_backend', False)
+    data_key = param.get('data')
+    data_param = data_params_item.get(data_key)
+    log_.info(f"data_key = {data_key}, data_param = {data_param}")
+    rule_key = param.get('rule')
+    rule_param = rule_params_item.get(rule_key)
+    log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
+    # cal_score_func = rule_param.get('cal_score_func', 1)
+    merge_func = rule_param.get('merge_func', 1)
+
+    if merge_func == 2:
+        for apptype, weight in data_param.items():
+            df = feature_df[feature_df['apptype'] == apptype]
+            # 计算score
+            score_df = cal_score(df=df, param=rule_param)
+            score_df['score'] = score_df['score'] * weight
+            score_df_list.append(score_df)
+        # 分数合并
+        df_merged = reduce(merge_df_with_score, score_df_list)
+        # 更新平台回流比
+        df_merged['platform_return_rate'] = df_merged['platform_return'] / df_merged['回流人数']
+        video_rank_h(df=df_merged, now_date=now_date, now_h=now_h,
+                     rule_key=rule_key, param=rule_param, data_key=data_key,
+                     notify_backend=notify_backend)
+    else:
+        df_list = [feature_df[feature_df['apptype'] == apptype] for apptype, _ in data_param.items()]
+        df_merged = reduce(merge_df, df_list)
+        score_df = cal_score(df=df_merged, param=rule_param)
+        video_rank_h(df=score_df, now_date=now_date, now_h=now_h,
+                     rule_key=rule_key, param=rule_param, data_key=data_key,
+                     notify_backend=notify_backend)
+    log_.info(f"param = {param} end!")
+
+
 def rank_by_h(now_date, now_h, rule_params, project, table):
     # 获取特征数据
     feature_df = get_feature_data(now_date=now_date, now_h=now_h, project=project, table=table)
@@ -330,40 +371,49 @@ def rank_by_h(now_date, now_h, rule_params, project, table):
         video_rank_h(df=score_df, now_date=now_date, now_h=now_h,
                      rule_key=rule_key, param=rule_param, data_key=data_key)
     """
-
-    for param in rule_params.get('params_list'):
-        score_df_list = []
-        notify_backend = param.get('notify_backend', False)
-        data_key = param.get('data')
-        data_param = data_params_item.get(data_key)
-        log_.info(f"data_key = {data_key}, data_param = {data_param}")
-        rule_key = param.get('rule')
-        rule_param = rule_params_item.get(rule_key)
-        log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
-        # cal_score_func = rule_param.get('cal_score_func', 1)
-        merge_func = rule_param.get('merge_func', 1)
-
-        if merge_func == 2:
-            for apptype, weight in data_param.items():
-                df = feature_df[feature_df['apptype'] == apptype]
-                # 计算score
-                score_df = cal_score(df=df, param=rule_param)
-                score_df['score'] = score_df['score'] * weight
-                score_df_list.append(score_df)
-            # 分数合并
-            df_merged = reduce(merge_df_with_score, score_df_list)
-            # 更新平台回流比
-            df_merged['platform_return_rate'] = df_merged['platform_return'] / df_merged['回流人数']
-            video_rank_h(df=df_merged, now_date=now_date, now_h=now_h,
-                         rule_key=rule_key, param=rule_param, data_key=data_key,
-                         notify_backend=notify_backend)
-        else:
-            df_list = [feature_df[feature_df['apptype'] == apptype] for apptype, _ in data_param.items()]
-            df_merged = reduce(merge_df, df_list)
-            score_df = cal_score(df=df_merged, param=rule_param)
-            video_rank_h(df=score_df, now_date=now_date, now_h=now_h,
-                         rule_key=rule_key, param=rule_param, data_key=data_key,
-                         notify_backend=notify_backend)
+    params_list = rule_params.get('params_list')
+    pool = multiprocessing.Pool(processes=len(params_list))
+    for param in params_list:
+        pool.apply_async(
+            func=process_with_param,
+            args=(param, data_params_item, rule_params_item, feature_df, now_date, now_h)
+        )
+    pool.close()
+    pool.join()
+
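+    # Each entry of params_list is now handled in its own pool process via apply_async; the
+    # AsyncResults are never collected, so an exception inside process_with_param is not propagated
+    # here and the start/end logging in process_with_param is the only signal of failure.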
+    # for param in rule_params.get('params_list'):
+    #     score_df_list = []
+    #     notify_backend = param.get('notify_backend', False)
+    #     data_key = param.get('data')
+    #     data_param = data_params_item.get(data_key)
+    #     log_.info(f"data_key = {data_key}, data_param = {data_param}")
+    #     rule_key = param.get('rule')
+    #     rule_param = rule_params_item.get(rule_key)
+    #     log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
+    #     # cal_score_func = rule_param.get('cal_score_func', 1)
+    #     merge_func = rule_param.get('merge_func', 1)
+    #
+    #     if merge_func == 2:
+    #         for apptype, weight in data_param.items():
+    #             df = feature_df[feature_df['apptype'] == apptype]
+    #             # 计算score
+    #             score_df = cal_score(df=df, param=rule_param)
+    #             score_df['score'] = score_df['score'] * weight
+    #             score_df_list.append(score_df)
+    #         # 分数合并
+    #         df_merged = reduce(merge_df_with_score, score_df_list)
+    #         # 更新平台回流比
+    #         df_merged['platform_return_rate'] = df_merged['platform_return'] / df_merged['回流人数']
+    #         video_rank_h(df=df_merged, now_date=now_date, now_h=now_h,
+    #                      rule_key=rule_key, param=rule_param, data_key=data_key,
+    #                      notify_backend=notify_backend)
+    #     else:
+    #         df_list = [feature_df[feature_df['apptype'] == apptype] for apptype, _ in data_param.items()]
+    #         df_merged = reduce(merge_df, df_list)
+    #         score_df = cal_score(df=df_merged, param=rule_param)
+    #         video_rank_h(df=score_df, now_date=now_date, now_h=now_h,
+    #                      rule_key=rule_key, param=rule_param, data_key=data_key,
+    #                      notify_backend=notify_backend)
 
     #     # to-csv
     #     score_filename = f"score_by24h_{key}_{datetime.strftime(now_date, '%Y%m%d%H')}.csv"
@@ -439,21 +489,34 @@ def h_timer_check():
         log_.info(f"now_date: {datetime.strftime(now_date, '%Y%m%d%H')}")
         now_min = datetime.now().minute
         now_h = datetime.now().hour
+        redis_helper = RedisHelper()
         # 查看当前天级更新的数据是否已准备好
         h_data_count = h_data_check(project=project, table=table, now_date=now_date, now_h=now_h)
         if now_h == 23 or now_h < 8:
             log_.info(f'now_h = {now_h} use bottom data!')
             h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params)
             log_.info(f"24h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_24H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_24h_data status update to '1' finished!")
         elif h_data_count > 0:
             log_.info(f'h_by24h_data_count = {h_data_count}')
             # 数据准备好,进行更新
             rank_by_h(now_date=now_date, now_h=now_h, rule_params=rule_params, project=project, table=table)
             log_.info(f"24h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_24H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_24h_data status update to '1' finished!")
         elif now_min > 40:
             log_.info('h_by24h_recall data is None, use bottom data!')
             h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params)
             log_.info(f"24h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_24H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_24h_data status update to '1' finished!")
         else:
             # 数据没准备好,1分钟后重新检查
             Timer(60, h_timer_check).start()

+ 13 - 0
rule_rank_h_new.py

@@ -263,11 +263,16 @@ def h_timer_check():
         log_.info(f"now_date: {datetime.strftime(now_date, '%Y%m%d%H')}")
         now_min = datetime.now().minute
         now_h = datetime.now().hour
+        redis_helper = RedisHelper()
 
         if now_h == 0:
             log_.info(f'now_h = {now_h} use bottom data!')
             h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params)
             log_.info(f"h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_h_data status update to '1' finished!")
             return
         # 查看当前小时级更新的数据是否已准备好
         h_data_count = h_data_check(project=project, table=table, now_date=now_date)
@@ -276,10 +281,18 @@ def h_timer_check():
             # 数据准备好,进行更新
             rank_by_h(now_date=now_date, now_h=now_h, rule_params=rule_params, project=project, table=table)
             log_.info(f"h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_h_data status update to '1' finished!")
         elif now_min > 40:
             log_.info('h_recall data is None, use bottom data!')
             h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params)
             log_.info(f"h_data end!")
+            redis_helper.set_data_to_redis(
+                key_name=f"{config_.RULE_H_DATA_STATUS}:{datetime.strftime(now_date, '%Y%m%d%H')}", value='1', expire_time=2 * 3600
+            )
+            log_.info(f"rule_h_data status update to '1' finished!")
         else:
             # 数据没准备好,1分钟后重新检查
             Timer(60, h_timer_check).start()