فهرست منبع

add 24h dup data

liqian 2 سال پیش
والد
کامیت
41c7805d6a
2 فایل تغییر یافته به همراه 46 افزوده شده و 7 حذف شده
  1. 14 2
      config.py
  2. 32 5
      rule_rank_h_by_24h.py

+ 14 - 2
config.py

@@ -167,6 +167,8 @@ class BaseConfig(object):
             'rule_params': {
                 'rule2': {'cal_score_func': 2, 'return_count': 40, 'platform_return_rate': 0.001,
                           'view_type': 'preview'},
+                'rule3': {'cal_score_func': 2, 'return_count': 100, 'platform_return_rate': 0.001,
+                          'view_type': 'preview'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -175,6 +177,7 @@ class BaseConfig(object):
             'params_list': [
                 {'data': 'data1', 'rule': 'rule2'},
                 {'data': 'data2', 'rule': 'rule2'},
+                {'data': 'data1', 'rule': 'rule3'},
             ],
         },
         APP_TYPE['LONG_VIDEO']: {
@@ -389,6 +392,8 @@ class BaseConfig(object):
                 # 'rule2': {'view_type': 'video-show', 'platform_return_rate': 0.001, 'region_24h_rule_key': 'rule2'},
                 'rule3': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
                           'region_24h_rule_key': 'rule2'},
+                'rule4': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                          'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3'},
             },
             'data_params': {
                 'data1': [APP_TYPE['VLOG'], ],
@@ -397,6 +402,7 @@ class BaseConfig(object):
             'params_list': [
                 {'data': 'data1', 'rule': 'rule3'},
                 {'data': 'data2', 'rule': 'rule3'},
+                {'data': 'data1', 'rule': 'rule4'},
             ],
         },
         APP_TYPE['LONG_VIDEO']: {
@@ -525,6 +531,9 @@ class BaseConfig(object):
     # 小程序小时级24h数据更新结果存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.item.score.apptype.24h.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_BY_24H = 'com.weiqu.video.recall.item.score.apptype.24h.'
+    # 小程序小时级24h数据 筛选后的剩余数据 更新结果存放 redis key前缀,
+    # 完整格式:com.weiqu.video.recall.item.score.apptype.24h.other.{appType}.{data_key}.{rule_key}.{date}.{h}
+    RECALL_KEY_NAME_PREFIX_BY_24H_OTHER = 'com.weiqu.video.recall.item.score.apptype.24h.other.'
     # 小程序离线ROV模型结果与小程序小时级24h更新结果去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup.24h.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP_24H = 'com.weiqu.video.recall.hot.item.score.dup.24h.'
@@ -546,6 +555,9 @@ class BaseConfig(object):
     # 小程序24h更新结果与 小程序地域分组24h更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup2.apptype.region.24h.h.{region}.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H = 'com.weiqu.video.recall.hot.item.score.dup2.apptype.region.24h.h.'
+    # 小程序小时级24h数据 筛选后的剩余数据 更新结果 与 小程序24h更新结果/小程序地域分组24h更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
+    # 完整格式:com.weiqu.video.recall.hot.item.score.dup3.apptype.region.24h.h.{appType}.{data_key}.{rule_key}.{date}.{h}
+    RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H = 'com.weiqu.video.recall.hot.item.score.dup3.apptype.region.24h.h.'
     # 小程序离线ROV模型结果与 小程序天级更新结果/小程序地域分组天级更新结果/小程序地域分组小时级更新结果 去重后 存放 redis key前缀,
     # 完整格式:com.weiqu.video.recall.hot.item.score.dup.apptype.region.h.{region}.{appType}.{data_key}.{rule_key}.{date}.{h}
     RECALL_KEY_NAME_PREFIX_DUP_REGION_H = 'com.weiqu.video.recall.hot.item.score.dup.apptype.region.h.'
@@ -988,8 +1000,8 @@ class ProductionConfig(BaseConfig):
 
 def set_config():
     # 获取环境变量 ROV_OFFLINE_ENV
-    env = os.environ.get('ROV_OFFLINE_ENV')
-    # env = 'dev'
+    # env = os.environ.get('ROV_OFFLINE_ENV')
+    env = 'dev'
     if env is None:
         # log_.error('ENV ERROR: is None!')
         return

+ 32 - 5
rule_rank_h_by_24h.py

@@ -142,6 +142,11 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
     # log_.info(f'initial data count = {len(initial_data)}')
     log_.info(f"app_type = {app_type}, videos_count = {len(df)}")
 
+    # videoid重复时,保留分值高
+    df = df.sort_values(by=['score'], ascending=False)
+    df = df.drop_duplicates(subset=['videoid'], keep='first')
+    df['videoid'] = df['videoid'].astype(int)
+
     # 获取符合进入召回源条件的视频
     return_count = param.get('return_count')
     if return_count:
@@ -152,9 +157,9 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
     day_recall_df = day_recall_df[day_recall_df['platform_return_rate'] > platform_return_rate]
 
     # videoid重复时,保留分值高
-    day_recall_df = day_recall_df.sort_values(by=['score'], ascending=False)
-    day_recall_df = day_recall_df.drop_duplicates(subset=['videoid'], keep='first')
-    day_recall_df['videoid'] = day_recall_df['videoid'].astype(int)
+    # day_recall_df = day_recall_df.sort_values(by=['score'], ascending=False)
+    # day_recall_df = day_recall_df.drop_duplicates(subset=['videoid'], keep='first')
+    # day_recall_df['videoid'] = day_recall_df['videoid'].astype(int)
     day_recall_videos = day_recall_df['videoid'].to_list()
     log_.info(f'h_by24h_recall videos count = {len(day_recall_videos)}')
 
@@ -170,14 +175,36 @@ def video_rank_h(df, now_date, now_h, rule_key, param, app_type, data_key):
         score = day_recall_df[day_recall_df['videoid'] == video_id]['score']
         day_recall_result[int(video_id)] = float(score)
         day_video_ids.append(int(video_id))
-    day_recall_key_name = \
+    h_24h_recall_key_name = \
         f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{app_type}.{data_key}.{rule_key}.{now_dt}.{now_h}"
     if len(day_recall_result) > 0:
         log_.info(f"count = {len(day_recall_result)}")
-        redis_helper.add_data_with_zset(key_name=day_recall_key_name, data=day_recall_result, expire_time=23 * 3600)
+        redis_helper.add_data_with_zset(key_name=h_24h_recall_key_name, data=day_recall_result, expire_time=23 * 3600)
         # 清空线上过滤应用列表
         redis_helper.del_keys(key_name=f"{config_.H_VIDEO_FILER_24H}{app_type}.{data_key}.{rule_key}")
 
+    if rule_key == 'rule3':
+        # 去重筛选结果,保留剩余数据并写入Redis
+        all_videos = df['videoid'].to_list()
+        log_.info(f'h_by24h_recall all videos count = {len(all_videos)}')
+        # 视频状态过滤
+        all_filtered_videos = filter_video_status(all_videos)
+        log_.info(f'all_filtered_videos count = {len(all_filtered_videos)}')
+        # 与筛选结果去重
+        other_videos = [video for video in all_filtered_videos if video not in day_video_ids]
+        log_.info(f'other_videos count = {len(other_videos)}')
+        # 写入对应的redis
+        other_24h_recall_result = {}
+        for video_id in other_videos:
+            score = df[df['videoid'] == video_id]['score']
+            other_24h_recall_result[int(video_id)] = float(score)
+        other_h_24h_recall_key_name = \
+            f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H_OTHER}{app_type}.{data_key}.{rule_key}.{now_dt}.{now_h}"
+        if len(other_24h_recall_result) > 0:
+            log_.info(f"count = {len(other_24h_recall_result)}")
+            redis_helper.add_data_with_zset(key_name=other_h_24h_recall_key_name, data=other_24h_recall_result,
+                                            expire_time=23 * 3600)
+
     # 去重更新rov模型结果,并另存为redis中
     # initial_data_dup = {}
     # for video_id, score in initial_data: