瀏覽代碼

Merge branch 'feature/zhangbo_flow_recall' of algorithm/rov-offline into master

不区分app权重的地域召回子策略离线数据生产
zhangbo 1 年之前
父節點
當前提交
5fb4188999
共有 4 個文件被更改,包括 52 次插入 和 8 次删除
  1. 31 2
      config.py
  2. 7 3
      region_rule_rank_h_by24h.py
  3. 13 0
      region_rule_rank_h_task_zhangbo.sh
  4. 1 3
      rule_rank_h_by_24h.py

+ 31 - 2
config.py

@@ -203,6 +203,19 @@ class BaseConfig(object):
         'data14': {APP_TYPE['VLOG']: 0.78, APP_TYPE['LOVE_LIVE']: 0.11, APP_TYPE['SHORT_VIDEO']: 0.08,
                    APP_TYPE['LONG_VIDEO']: 0.03},
         'videos5': {APP_TYPE['LONG_VIDEO']: 1},  # [内容精选]
+        'data66': {
+            APP_TYPE['VLOG']: 0.3,
+            APP_TYPE['LOVE_LIVE']: 0.2,
+            APP_TYPE['LONG_VIDEO']: 0.2,
+            APP_TYPE['SHORT_VIDEO']: 0.1,
+            # APP_TYPE['WAN_NENG_VIDEO']: 1,
+            # APP_TYPE['LAO_HAO_KAN_VIDEO']: 1,
+            # APP_TYPE['ZUI_JING_QI']: 1,
+            APP_TYPE['APP']: 0.05,
+            APP_TYPE['PIAO_QUAN_VIDEO_PLUS']: 0.05,
+            APP_TYPE['JOURNEY']: 0.05,
+            APP_TYPE['BLESSING_YEAR']: 0.05
+        },
 
     }
 
@@ -249,6 +262,8 @@ class BaseConfig(object):
                       'view_type': 'preview'},
             'rule4': {'cal_score_func': 2, 'return_count': 100, 'platform_return_rate': 0.001,
                       'view_type': 'preview', 'merge_func': 2},
+            'rule66': {'cal_score_func': 2, 'return_count': 100, 'platform_return_rate': 0.001,
+                      'view_type': 'preview'},
             # # 无回流人群
             # 'rule5': {'return_count': 100, 'platform_return_rate': 0.001,
             #           'view_type': 'preview', 'click_score_rate': 0.7},
@@ -281,6 +296,7 @@ class BaseConfig(object):
             # {'data': 'data1', 'rule': 'rule7'},
             # {'data': 'data1', 'rule': 'rule8'},
             {'data': 'videos5', 'rule': 'rule4'},  # [内容精选]
+            {'data': 'data66', 'rule': 'rule66'},
         ]
     }
 
@@ -299,6 +315,8 @@ class BaseConfig(object):
                       'platform_return_rate': 0.001, 'merge_func': 2},
             'rule5': {'view_type': 'preview', 'return_count': 21, 'score_rule': 0,
                       'platform_return_rate': 0.001, 'merge_func': 2},
+            'rule66': {'view_type': 'video-show', 'return_count': 21, 'score_rule': 0,
+                      'platform_return_rate': 0.001},
             # # 无回流人群
             # 'rule6': {'view_type': 'video-show', 'return_count': 21, 'score_rule': 0,
             #           'platform_return_rate': 0.001, 'click_score_rate': 0.7},
@@ -331,6 +349,7 @@ class BaseConfig(object):
             # {'data': 'data1', 'rule': 'rule8'},
             # {'data': 'data1', 'rule': 'rule9'},
             {'data': 'videos5', 'rule': 'rule4'},  # [内容精选]
+            {'data': 'data66', 'rule': 'rule66'},
         ]
     }
 
@@ -470,8 +489,10 @@ class BaseConfig(object):
             'rule29': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
                        'region_24h_rule_key': 'rule4', '24h_rule_key': 'rule4', 'merge_func': 2,
                        'score_func': 'back_rate_rank_weighting'},
-
-
+            'rule66': {
+                'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                'region_24h_rule_key': 'rule66', '24h_rule_key': 'rule66'
+            },
         },
         'data_params': DATA_PARAMS,
         'params_list': [
@@ -513,6 +534,7 @@ class BaseConfig(object):
             {'data': 'data10', 'rule': 'rule28'},  # 503
             # {'data': 'data10', 'rule': 'rule29'},  # 509
             {'data': 'data10', 'rule': 'rule30'},  # 510
+            {'data': 'data66', 'rule': 'rule66'}, # 520
         ],
         'params_list_new': [
             # {'data': 'data10', 'rule': 'rule19'},  # 316 票圈视频 + 召回在线去重
@@ -2545,6 +2567,12 @@ class TestConfig(BaseConfig):
         'password': 'lv_manager@2020',
         'db': 'longvideo',
         'charset': 'utf8'
+        # 'host': 'am-bp15tqt957i3b3sgi131950.ads.aliyuncs.com',
+        # 'port': 3306,
+        # 'user': 'lv_manager',
+        # 'password': 'lv_manager@2020',
+        # 'db': 'longvideo',
+        # 'charset': 'utf8'
     }
 
     # 日志服务配置
@@ -2759,6 +2787,7 @@ class ProductionConfig(BaseConfig):
 def set_config():
     # 获取环境变量 ROV_OFFLINE_ENV
     env = os.environ.get('ROV_OFFLINE_ENV')
+    print("ROV_OFFLINE_ENV:{}".format(str(env)))
     # env = 'dev'
     if env is None:
         # log_.error('ENV ERROR: is None!')

+ 7 - 3
region_rule_rank_h_by24h.py

@@ -160,19 +160,21 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key):
     platform_return_rate = param.get('platform_return_rate', 0)
     h_recall_df = df[(df['lastday_return'] >= return_count) & (df['score'] >= score_value)
                      & (df['platform_return_rate'] >= platform_return_rate)]
+    log_.info(f'h_recall_df count = {len(h_recall_df)}')
     # videoid重复时,保留分值高
     h_recall_df = h_recall_df.sort_values(by=['score'], ascending=False)
     h_recall_df = h_recall_df.drop_duplicates(subset=['videoid'], keep='first')
     h_recall_df['videoid'] = h_recall_df['videoid'].astype(int)
     h_recall_videos = h_recall_df['videoid'].to_list()
-    # log_.info(f'day_recall videos count = {len(h_recall_videos)}')
+    log_.info(f'h_recall_videos count = {len(h_recall_videos)}')
+    log_.info('h_recall_videos:{}'.format('-'.join([str(i) for i in h_recall_videos])))
 
     # 视频状态过滤
     if data_key in ['data7', ]:
         filtered_videos = filter_video_status_app(h_recall_videos)
     else:
         filtered_videos = filter_video_status(h_recall_videos)
-    # log_.info('filtered_videos count = {}'.format(len(filtered_videos)))
+    log_.info('filtered_videos count = {}'.format(len(filtered_videos)))
 
     # 写入对应的redis
     h_video_ids = []
@@ -182,10 +184,12 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key):
         # print(score)
         day_recall_result[int(video_id)] = float(score)
         h_video_ids.append(int(video_id))
-
     day_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_24H}{region}:{data_key}:{rule_key}:" \
         f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
+    log_.info("day_recall_result.type:{}".format(str(type(day_recall_result))))
+    log_.info("begin to write redis for day_recall_key_name:{} with {}".format(day_recall_key_name,
+                                                                               str(len(day_recall_result))))
     if len(day_recall_result) > 0:
         redis_helper.add_data_with_zset(key_name=day_recall_key_name, data=day_recall_result, expire_time=2 * 3600)
         # 清空线上过滤应用列表

+ 13 - 0
region_rule_rank_h_task_zhangbo.sh

@@ -0,0 +1,13 @@
+source /etc/profile
+echo $ROV_OFFLINE_ENV
+if [[ $ROV_OFFLINE_ENV == 'test' ]]; then
+    cd /root/zhangbo/rov-offline && /root/anaconda3/bin/python rule_rank_h_by_24h.py &&
+     /root/anaconda3/bin/python region_rule_rank_h_by24h.py &&
+     /root/anaconda3/bin/python rule_rank_h_new.py &&
+      /root/anaconda3/bin/python region_rule_rank_h.py '24h'
+elif [[ $ROV_OFFLINE_ENV == 'pro' ]]; then
+    cd /root/zhangbo/rov-offline && /root/anaconda3/bin/python rule_rank_h_by_24h.py &&
+     /root/anaconda3/bin/python region_rule_rank_h_by24h.py &&
+     /root/anaconda3/bin/python rule_rank_h_new.py &&
+      /root/anaconda3/bin/python region_rule_rank_h.py '24h'
+fi

+ 1 - 3
rule_rank_h_by_24h.py

@@ -191,7 +191,6 @@ def video_rank_h(df, now_date, now_h, rule_key, param, data_key, notify_backend)
     day_recall_df = day_recall_df[day_recall_df['platform_return_rate'] > platform_return_rate]
     day_recall_videos = day_recall_df['videoid'].to_list()
     log_.info(f'h_by24h_recall videos count = {len(day_recall_videos)}')
-
     # 视频状态过滤
     if data_key in ['data7', ]:
         filtered_videos = filter_video_status_app(day_recall_videos)
@@ -212,7 +211,7 @@ def video_rank_h(df, now_date, now_h, rule_key, param, data_key, notify_backend)
 
     h_24h_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{data_key}:{rule_key}:{now_dt}:{now_h}"
-
+    log_.info("h_24h_recall_key_name:redis:{}".format(h_24h_recall_key_name))
     if len(day_recall_result) > 0:
         log_.info(f"count = {len(day_recall_result)}, key = {h_24h_recall_key_name}")
         redis_helper.add_data_with_zset(key_name=h_24h_recall_key_name, data=day_recall_result, expire_time=2 * 3600)
@@ -358,7 +357,6 @@ def rank_by_h(now_date, now_h, rule_params, project, table):
             video_rank_h(df=df_merged, now_date=now_date, now_h=now_h,
                          rule_key=rule_key, param=rule_param, data_key=data_key,
                          notify_backend=notify_backend)
-
         else:
             df_list = [feature_df[feature_df['apptype'] == apptype] for apptype, _ in data_param.items()]
             df_merged = reduce(merge_df, df_list)