Prechádzať zdrojové kódy

add abtest-153 data

liqian pred 2 rokmi
rodič
commit
12cf0bba32
2 zmenené súbory: 58 pridaní a 5 odobraní
  1. 4 0
      config.py
  2. 54 5
      region_rule_rank_h.py

+ 4 - 0
config.py

@@ -290,6 +290,9 @@ class BaseConfig(object):
             # 有回流人群
             'rule11': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
                        'region_24h_rule_key': 'rule7', '24h_rule_key': 'rule6', 'back_score_rate': 0.7},
+            # 19点地域小时级列表中增加7点-18点地域小时级的优质视频
+            'rule12': {'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+                       'region_24h_rule_key': 'rule2', '24h_rule_key': 'rule3', 'add_videos_in_19h': True},
         },
         'data_params': DATA_PARAMS,
         'params_list': [
@@ -309,6 +312,7 @@ class BaseConfig(object):
             {'data': 'data11', 'rule': 'rule7'},
             {'data': 'data12', 'rule': 'rule7'},
             {'data': 'data13', 'rule': 'rule7'},
+            {'data': 'data1', 'rule': 'rule12'},
         ],
     }
 

+ 54 - 5
region_rule_rank_h.py

@@ -165,7 +165,41 @@ def cal_score(df, param):
     return df
 
 
-def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank_h_flag):
+def add_videos(initial_df, now_date, rule_key, region, data_key):
+    """
+    Add high-scoring videos from the 07:00-18:00 regional hourly lists to the 19:00 list.
+
+    For every hour in 7..18 of ``now_date`` the regional hourly recall zset is read
+    through ``RedisHelper``. Entries whose score is strictly greater than the lowest
+    score in ``initial_df`` (or greater than 0 when ``initial_df`` is empty) are appended
+    to ``initial_df``; the result is then de-duplicated by videoid, keeping the row
+    with the highest score.
+
+    :param initial_df: filtered 19:00 regional hourly result; must have 'videoid' and 'score' columns
+    :param now_date: datetime whose '%Y%m%d' form is the date part of the redis keys
+    :param rule_key: rule identifier, part of the redis key
+    :param region: region code, part of the redis key
+    :param data_key: data identifier, part of the redis key
+    :return: DataFrame sorted by score (descending) with integer, unique videoids
+    """
+    redis_helper = RedisHelper()
+    # Everything in the key except the trailing hour is the same for all hours: build it once.
+    key_prefix = f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H}{region}:{data_key}:{rule_key}:" \
+                 f"{now_date.strftime('%Y%m%d')}:"
+    pre_h_data = []
+    for pre_h in range(7, 19):
+        hour_data = redis_helper.get_all_data_from_zset(key_name=f"{key_prefix}{pre_h}", with_scores=True)
+        # Skip hours for which the helper returned nothing.
+        if hour_data:
+            pre_h_data.extend(hour_data)
+    pre_h_df = pd.DataFrame(data=pre_h_data, columns=['videoid', 'score'])
+    # Threshold is the lowest score already in the 19:00 list, or 0 when that list is empty.
+    # Series.min() skips NaN, so the result does not depend on row order.
+    min_score = initial_df['score'].min() if len(initial_df) > 0 else 0
+    pre_h_df = pre_h_df[pre_h_df['score'] > min_score]
+    df = pd.concat([initial_df, pre_h_df], ignore_index=True)
+    # Cast ids to int before de-duplicating so the same video coming from both
+    # sources compares equal (presumably redis members are not ints - confirm).
+    df['videoid'] = df['videoid'].astype(int)
+    # Sort by score first so that keep="first" retains the highest-scoring row per videoid.
+    df = df.sort_values(by=['score'], ascending=False)
+    df = df.drop_duplicates(subset=['videoid'], keep="first")
+    return df
+
+
+def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank_h_flag, add_videos_in_19h):
     """
     获取符合进入召回源条件的视频,与每日更新的rov模型结果视频列表进行合并
     :param df:
@@ -184,10 +218,20 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank
     platform_return_rate = param.get('platform_return_rate', 0)
     h_recall_df = df[(df['lastonehour_return'] >= return_count) & (df['score'] >= score_value)
                      & (df['platform_return_rate'] >= platform_return_rate)]
+
     # videoid重复时,保留分值高
     h_recall_df = h_recall_df.sort_values(by=['score'], ascending=False)
     h_recall_df = h_recall_df.drop_duplicates(subset=['videoid'], keep='first')
     h_recall_df['videoid'] = h_recall_df['videoid'].astype(int)
+
+    # 19点增加打捞的优质视频
+    if now_h == 19 and add_videos_in_19h is True:
+        # print(len(h_recall_df))
+        h_recall_df = add_videos(initial_df=h_recall_df, now_date=now_date, rule_key=rule_key,
+                                 region=region, data_key=data_key)
+        # print(len(h_recall_df))
+
+
     h_recall_videos = h_recall_df['videoid'].to_list()
     # log_.info(f'h_recall videos count = {len(h_recall_videos)}')
 
@@ -359,14 +403,16 @@ def merge_df_with_score(df_left, df_right):
     return df_merged[feature_list]
 
 
-def process_with_region(region, df_merged, data_key, rule_key, rule_param, now_date, now_h, rule_rank_h_flag):
+def process_with_region(region, df_merged, data_key, rule_key, rule_param, now_date, now_h,
+                        rule_rank_h_flag, add_videos_in_19h):
     log_.info(f"region = {region} start...")
     # 计算score
     region_df = df_merged[df_merged['code'] == region]
     log_.info(f'region = {region}, region_df count = {len(region_df)}')
     score_df = cal_score(df=region_df, param=rule_param)
     video_rank(df=score_df, now_date=now_date, now_h=now_h, rule_key=rule_key, param=rule_param,
-               region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag)
+               region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag,
+               add_videos_in_19h=add_videos_in_19h)
     log_.info(f"region = {region} end!")
 
 
@@ -471,6 +517,8 @@ def process_with_param(param, data_params_item, rule_params_item, region_code_li
     rule_param = rule_params_item.get(rule_key)
     log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
     merge_func = rule_param.get('merge_func', None)
+    # 是否在19点的数据中增加打捞的优质视频
+    add_videos_in_19h = rule_param.get('add_videos_in_19h', False)
 
     if merge_func == 2:
         score_df_list = []
@@ -494,7 +542,8 @@ def process_with_param(param, data_params_item, rule_params_item, region_code_li
         df_merged = reduce(merge_df, df_list)
         task_list = [
             gevent.spawn(process_with_region,
-                         region, df_merged, data_key, rule_key, rule_param, now_date, now_h, rule_rank_h_flag)
+                         region, df_merged, data_key, rule_key, rule_param, now_date, now_h, rule_rank_h_flag,
+                         add_videos_in_19h)
             for region in region_code_list
         ]
 
@@ -522,7 +571,7 @@ def rank_by_h(project, table, now_date, now_h, rule_params, region_code_list, ru
     rule_params_item = rule_params.get('rule_params')
     params_list = rule_params.get('params_list')
     pool = multiprocessing.Pool(processes=len(params_list))
-    for param in params_list:
+    for param in params_list[0:1]:
         pool.apply_async(
             func=process_with_param,
             args=(param, data_params_item, rule_params_item, region_code_list, feature_df, now_date, now_h, rule_rank_h_flag)