Browse Source

Merge branch 'remove-rov-list-ab' into pre-master

liqian 2 years ago
parent
commit
9940b52eb6
5 changed files with 103 additions and 15 deletions
  1. check_video_limit_distribute.py (+17 -0)
  2. config.py (+19 -7)
  3. region_rule_rank_h.py (+34 -3)
  4. rule_rank_h_by_24h.py (+32 -5)
  5. videos_filter.py (+1 -0)

+ 17 - 0
check_video_limit_distribute.py

@@ -114,6 +114,23 @@ def process_with_region(app_type, data_key, rule_key, region, stop_distribute_vi
         values=stop_distribute_video_id_list,
         expire_time=2 * 3600
     )
+    # 将已超分发视频 移除 不区分相对24h列表2
+    if rule_key == 'rule4':
+        key_name = f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}" \
+                   f"{region}.{app_type}.{data_key}.{rule_key}." \
+                   f"{datetime.datetime.strftime(now_date, '%Y%m%d')}.{now_h}"
+        if not redis_helper.key_exists(key_name=key_name):
+            if now_h == 0:
+                redis_date = now_date - datetime.timedelta(days=1)
+                redis_h = 23
+            else:
+                redis_date = now_date
+                redis_h = now_h - 1
+            key_name = f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}" \
+                       f"{region}.{app_type}.{data_key}.{rule_key}." \
+                       f"{datetime.datetime.strftime(redis_date, '%Y%m%d')}.{redis_h}"
+        redis_helper.remove_value_from_zset(key_name=key_name, value=stop_distribute_video_id_list)
+
     # 将已超分发视频 移除 大列表
     key_name = f"{config_.RECALL_KEY_NAME_PREFIX_DUP_REGION_H}" \
                f"{region}.{app_type}.{data_key}.{rule_key}." \

+ 19 - 7
config.py

@@ -167,6 +167,8 @@ class BaseConfig(object):
             'rule_params': {
                 'rule2': {'cal_score_func': 2, 'return_count': 40, 'platform_return_rate': 0.001,
                           'view_type': 'preview'},
+                'rule3': {'cal_score_func': 2, 'return_count': 100, 'platform_return_rate': 0.001,
+                          'view_type': 'preview'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -175,6 +177,7 @@ class BaseConfig(object):
             'params_list': [
                 {'data': 'data1', 'rule': 'rule2'},
                 {'data': 'data2', 'rule': 'rule2'},
+                {'data': 'data1', 'rule': 'rule3'},
             ],
         },
         APP_TYPE['LONG_VIDEO']: {
@@ -388,7 +391,9 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
+                'rule4': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -397,13 +402,14 @@ class BaseConfig(object):
             'params_list': [
                 {'data': 'data1', 'rule': 'rule3'},
                 {'data': 'data2', 'rule': 'rule3'},
+                {'data': 'data1', 'rule': 'rule4'},
             ],
         },
         APP_TYPE['LONG_VIDEO']: {
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -421,7 +427,7 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -434,7 +440,7 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -447,7 +453,7 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -460,7 +466,7 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -478,7 +484,7 @@ class BaseConfig(object):
             'rule_params': {
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
-                          'region_24h_rule_key': 'rule2'},
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule2'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -525,6 +531,9 @@ class BaseConfig(object):
     # 小程序小时级24h数据更新结果存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.item.score.apptype.24h.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_BY_24H = 'com.weiqu.video.recall.item.score.apptype.24h.'
+    # 小程序小时级24h数据 筛选后的剩余数据 更新结果存放 redis key前缀,
+    # 完整格式:com.weiqu.video.recall.item.score.apptype.24h.other.{appType}.{data_key}.{rule_key}.{date}.{h}
+    RECALL_KEY_NAME_PREFIX_BY_24H_OTHER = 'com.weiqu.video.recall.item.score.apptype.24h.other.'
     # 小程序离线ROV模型结果与小程序小时级24h更新结果去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup.24h.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP_24H = 'com.weiqu.video.recall.hot.item.score.dup.24h.'
@@ -546,6 +555,9 @@ class BaseConfig(object):
     # 小程序24h更新结果与 小程序地域分组24h更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup2.apptype.region.24h.h.{region}.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H = 'com.weiqu.video.recall.hot.item.score.dup2.apptype.region.24h.h.'
+    # 小程序小时级24h数据 筛选后的剩余数据 更新结果 与 小程序24h更新结果/小程序地域分组24h更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
+    # 完整格式:com.weiqu.video.recall.hot.item.score.dup3.apptype.region.24h.h.{region}.{appType}.{data_key}.{rule_key}.{date}.{h}
+    RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H = 'com.weiqu.video.recall.hot.item.score.dup3.apptype.region.24h.h.'
     # 小程序离线ROV模型结果与 小程序天级更新结果/小程序地域分组天级更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup.apptype.region.h.{region}.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP_REGION_H = 'com.weiqu.video.recall.hot.item.score.dup.apptype.region.h.'

+ 34 - 3
region_rule_rank_h.py

@@ -182,12 +182,14 @@ def video_rank(df, now_date, now_h, rule_key, param, region, app_type, data_key)
         redis_helper.del_keys(key_name=f"{config_.REGION_H_VIDEO_FILER}{region}.{app_type}.{data_key}.{rule_key}")
 
     region_24h_rule_key = param.get('region_24h_rule_key', 'rule1')
+    by_24h_rule_key = param.get('24h_rule_key', None)
     # 与其他召回视频池去重,存入对应的redis
     dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key,
-                 region_24h_rule_key=region_24h_rule_key, region=region, app_type=app_type, data_key=data_key)
+                 region_24h_rule_key=region_24h_rule_key, by_24h_rule_key=by_24h_rule_key,
+                 region=region, app_type=app_type, data_key=data_key)
 
 
-def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, region, app_type, data_key):
+def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by_24h_rule_key, region, app_type, data_key):
     """将地域分组小时级数据与其他召回视频池去重,存入对应的redis"""
     redis_helper = RedisHelper()
     # # ##### 去重更新地域分组天级列表,并另存为redis中
@@ -260,7 +262,7 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, re
     #         redis_helper.add_data_with_zset(key_name=day_dup_key_name, data=day_dup, expire_time=23 * 3600)
 
     # ##### 去重小程序相对24h更新结果,并另存为redis中
-    day_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{app_type}.{data_key}.rule2." \
+    day_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{app_type}.{data_key}.{by_24h_rule_key}." \
                    f"{datetime.datetime.strftime(now_date, '%Y%m%d')}.{now_h}"
     if redis_helper.key_exists(key_name=day_key_name):
         day_data = redis_helper.get_all_data_from_zset(key_name=day_key_name, with_scores=True)
@@ -289,6 +291,35 @@ def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, re
             # 清空线上过滤应用列表
             redis_helper.del_keys(key_name=f"{config_.H_VIDEO_FILER_24H}{region}.{app_type}.{data_key}.{rule_key}")
 
+    # ##### 去重小程序相对24h 筛选后剩余数据 更新结果,并另存为redis中
+    if by_24h_rule_key == 'rule3':
+        other_h_24h_recall_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H_OTHER}{app_type}.{data_key}." \
+                                      f"{by_24h_rule_key}.{datetime.datetime.strftime(now_date, '%Y%m%d')}.{now_h}"
+        if redis_helper.key_exists(key_name=other_h_24h_recall_key_name):
+            other_24h_data = redis_helper.get_all_data_from_zset(key_name=other_h_24h_recall_key_name, with_scores=True)
+            log_.info(f'24h other data count = {len(other_24h_data)}')
+
+            # 屏蔽视频过滤
+            other_24h_video_ids = [int(video_id) for video_id, _ in other_24h_data]
+            shield_key_name_list = config_.SHIELD_CONFIG.get(region, None)
+            if shield_key_name_list is not None:
+                other_24h_video_ids = filter_shield_video(video_ids=other_24h_video_ids, shield_key_name_list=shield_key_name_list)
+                log_.info(f"shield filtered_videos count = {len(other_24h_video_ids)}")
+
+            other_24h_dup = {}
+            for video_id, score in other_24h_data:
+                if int(video_id) not in h_video_ids and int(video_id) in other_24h_video_ids:
+                    other_24h_dup[int(video_id)] = score
+                    h_video_ids.append(int(video_id))
+            log_.info(f"other 24h data dup count = {len(other_24h_dup)}")
+            other_24h_dup_key_name = \
+                f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}{region}.{app_type}.{data_key}.{rule_key}." \
+                f"{datetime.datetime.strftime(now_date, '%Y%m%d')}.{now_h}"
+            if len(other_24h_dup) > 0:
+                redis_helper.add_data_with_zset(key_name=other_24h_dup_key_name, data=other_24h_dup, expire_time=23 * 3600)
+                # 限流视频score调整
+                update_limit_video_score(initial_videos=other_24h_dup, key_name=other_24h_dup_key_name)
+
     # ##### 去重小程序模型更新结果,并另存为redis中
     model_key_name = get_rov_redis_key(now_date=now_date)
     model_data = redis_helper.get_all_data_from_zset(key_name=model_key_name, with_scores=True)

+ 32 - 5
rule_rank_h_by_24h.py

@@ -142,6 +142,11 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
     # log_.info(f'initial data count = {len(initial_data)}')
     log_.info(f"app_type = {app_type}, videos_count = {len(df)}")
 
+    # videoid重复时,保留分值高
+    df = df.sort_values(by=['score'], ascending=False)
+    df = df.drop_duplicates(subset=['videoid'], keep='first')
+    df['videoid'] = df['videoid'].astype(int)
+
     # 获取符合进入召回源条件的视频
     return_count = param.get('return_count')
     if return_count:
@@ -152,9 +157,9 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
     day_recall_df = day_recall_df[day_recall_df['platform_return_rate'] > platform_return_rate]
 
     # videoid重复时,保留分值高
-    day_recall_df = day_recall_df.sort_values(by=['score'], ascending=False)
-    day_recall_df = day_recall_df.drop_duplicates(subset=['videoid'], keep='first')
-    day_recall_df['videoid'] = day_recall_df['videoid'].astype(int)
+    # day_recall_df = day_recall_df.sort_values(by=['score'], ascending=False)
+    # day_recall_df = day_recall_df.drop_duplicates(subset=['videoid'], keep='first')
+    # day_recall_df['videoid'] = day_recall_df['videoid'].astype(int)
     day_recall_videos = day_recall_df['videoid'].to_list()
     log_.info(f'h_by24h_recall videos count = {len(day_recall_videos)}')
 
@@ -170,14 +175,36 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
         score = day_recall_df[day_recall_df['videoid'] == video_id]['score']
         day_recall_result[int(video_id)] = float(score)
         day_video_ids.append(int(video_id))
-    day_recall_key_name = \
+    h_24h_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{app_type}.{data_key}.{rule_key}.{now_dt}.{now_h}"
     if len(day_recall_result) > 0:
         log_.info(f"count = {len(day_recall_result)}")
-        redis_helper.add_data_with_zset(key_name=day_recall_key_name, data=day_recall_result, expire_time=23 * 3600)
+        redis_helper.add_data_with_zset(key_name=h_24h_recall_key_name, data=day_recall_result, expire_time=23 * 3600)
         # 清空线上过滤应用列表
         redis_helper.del_keys(key_name=f"{config_.H_VIDEO_FILER_24H}{app_type}.{data_key}.{rule_key}")
 
+    if rule_key == 'rule3':
+        # 去重筛选结果,保留剩余数据并写入Redis
+        all_videos = df['videoid'].to_list()
+        log_.info(f'h_by24h_recall all videos count = {len(all_videos)}')
+        # 视频状态过滤
+        all_filtered_videos = filter_video_status(all_videos)
+        log_.info(f'all_filtered_videos count = {len(all_filtered_videos)}')
+        # 与筛选结果去重
+        other_videos = [video for video in all_filtered_videos if video not in day_video_ids]
+        log_.info(f'other_videos count = {len(other_videos)}')
+        # 写入对应的redis
+        other_24h_recall_result = {}
+        for video_id in other_videos:
+            score = df[df['videoid'] == video_id]['score']
+            other_24h_recall_result[int(video_id)] = float(score)
+        other_h_24h_recall_key_name = \
+            f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H_OTHER}{app_type}.{data_key}.{rule_key}.{now_dt}.{now_h}"
+        if len(other_24h_recall_result) > 0:
+            log_.info(f"count = {len(other_24h_recall_result)}")
+            redis_helper.add_data_with_zset(key_name=other_h_24h_recall_key_name, data=other_24h_recall_result,
+                                            expire_time=23 * 3600)
+
     # 去重更新rov模型结果,并另存为redis中
     # initial_data_dup = {}
     # for video_id, score in initial_data:

+ 1 - 0
videos_filter.py

@@ -577,6 +577,7 @@ def filter_process_with_region(app_type, data_key, rule_key, region, now_date, n
         # config_.RECALL_KEY_NAME_PREFIX_DUP1_REGION_DAY_H,
         # config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_DAY_H,
         config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H,
+        config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H,
         config_.RECALL_KEY_NAME_PREFIX_DUP_REGION_H
     ]
     for i, key_prefix in enumerate(key_prefix_list):