Jelajahi Sumber

523实验 地域1小时 更新公式

zhangbo 1 tahun lalu
induk
melakukan
9594130944
2 mengubah file dengan 64 tambahan dan 7 penghapusan
  1. 60 5
      alg_recsys_recall_1h_region.py
  2. 4 2
      region_rule_rank_h_v2.py

+ 60 - 5
alg_recsys_recall_1h_region.py

@@ -23,7 +23,8 @@ region_code = config_.REGION_CODE
 RULE_PARAMS = {
     'rule_params': {
         'rule66': {
-            'view_type': 'video-show-region', 'platform_return_rate': 0.001,
+            'view_type': 'video-show-region',
+            'score_func': '20240223',
             'region_24h_rule_key': 'rule66', '24h_rule_key': 'rule66'
         },
         'rule67': {
@@ -40,7 +41,7 @@ RULE_PARAMS = {
     'data_params': config_.DATA_PARAMS,
     'params_list': [
         # 532
-        # {'data': 'data66', 'rule': 'rule66'},  # 523-> 523 & 518
+        {'data': 'data66', 'rule': 'rule66'},  # 523-> 523 & 518
         # {'data': 'data66', 'rule': 'rule67'},  # 523->510
         # {'data': 'data66', 'rule': 'rule68'},  # 523->514
         # {'data': 'data66', 'rule': 'rule69'},  # 523->518
@@ -76,6 +77,9 @@ features = [
     'lastthreehour_return_now_new',  # h-3分享,过去1小时回流人数(回流统计为对应地域分享带回的回流,分享限制地域,回流不限制地域)
     'lastthreehour_return_new',  # h-3分享,h-3回流人数(回流统计为对应地域分享带回的回流,分享限制地域,回流不限制地域)
     'platform_return_new',  # 平台分发回流(回流统计为对应地域分享带回的回流,分享限制地域,回流不限制地域)
+
+    'lastonehour_allreturn',
+    'lastonehour_allsharecnt'
 ]
 
 
@@ -144,7 +148,8 @@ def get_day_30day_videos(now_date, data_key, rule_key):
 def get_feature_data(project, table, now_date):
     """获取特征数据"""
     dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
-    # dt = '2022041310'
+    # 张博 测试
+    dt = '2024022319all1last0'
     records = get_data_from_odps(date=dt, project=project, table=table)
     feature_data = []
     for record in records:
@@ -156,6 +161,36 @@ def get_feature_data(project, table, now_date):
     return feature_df
 
 
+def cal_score_initial_20240223(df, param):
+    """
+    Compute the ranking score with the '20240223' formula.
+
+    Derived columns are added to the frame and the score is
+    share_rate * (back_rate_new + 0.01 * back_rate_all)
+               * (log_back + 0.01 * log_back_all) * K2.
+
+    :param df: feature data
+    :param param: rule parameters; only 'view_type' is read here
+    :return: the frame with the derived columns added, sorted by 'score' descending
+    """
+    # Missing feature values are treated as 0 for all the arithmetic below.
+    df = df.fillna(0)
+    # Rates use a constant offset in the denominator (presumably smoothing for
+    # low-volume videos -- confirm with the formula owner).
+    df['share_rate'] = df['lastonehour_share'] / (df['lastonehour_play'] + 1000)
+    # NOTE(review): 'back_rate' is computed but not used in the score below.
+    df['back_rate'] = df['lastonehour_return'] / (df['lastonehour_share'] + 10)
+    df['back_rate_new'] = (df['lastonehour_return'] + 1) / (df['lastonehour_share'] + 10)
+    df['back_rate_all'] = df['lastonehour_allreturn'] / (df['lastonehour_allsharecnt'] + 10)
+    # Natural log of (count + 1), so a count of 0 maps to 0.
+    df['log_back'] = (df['lastonehour_return'] + 1).apply(math.log)
+    df['log_back_all'] = (df['lastonehour_allreturn'] + 1).apply(math.log)
+    # 'view_type' selects which exposure column is the ctr denominator;
+    # any other value (including a missing key) falls back to 'lastonehour_preview'.
+    if param.get('view_type', None) == 'video-show':
+        df['ctr'] = df['lastonehour_play'] / (df['lastonehour_show'] + 1000)
+    elif param.get('view_type', None) == 'video-show-region':
+        df['ctr'] = df['lastonehour_play'] / (df['lastonehour_show_region'] + 1000)
+    else:
+        df['ctr'] = df['lastonehour_play'] / (df['lastonehour_preview'] + 1000)
+    # K2 is ctr capped at 0.6.
+    df['K2'] = df['ctr'].apply(lambda x: 0.6 if x > 0.6 else x)
+    # NOTE(review): unlike the rates above, this denominator has no offset.
+    # After fillna(0), rows with lastonehour_return == 0 produce inf (or NaN for 0/0).
+    # The column does not feed into 'score' in this function.
+    df['platform_return_rate'] = df['platform_return'] / df['lastonehour_return']
+    df['score'] = df['share_rate'] * (
+        df['back_rate_new'] + 0.01 * df['back_rate_all']
+    ) * (
+            df['log_back'] + 0.01 * df['log_back_all']
+    ) * df['K2']
+    # Highest score first.
+    df = df.sort_values(by=['score'], ascending=False)
+    return df
+
 def cal_score_initial(df, param):
     """
     计算score
@@ -527,6 +562,8 @@ def cal_score(df, param):
             df = cal_score_with_back_rate_exponential_weighting2(df=df, param=param)
         elif param.get('score_func', None) == 'back_rate_rank_weighting':
             df = cal_score_with_back_rate_by_rank_weighting(df=df, param=param)
+        elif param.get('score_func', None) == '20240223':
+            df = cal_score_initial_20240223(df=df, param=param)
         else:
             df = cal_score_initial(df=df, param=param)
     return df
@@ -618,8 +655,26 @@ def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank
     return_count = param.get('return_count', 1)
     score_value = param.get('score_rule', 0)
     platform_return_rate = param.get('platform_return_rate', 0)
-    h_recall_df = df[(df['lastonehour_return'] >= return_count) & (df['score'] >= score_value)
-                     & (df['platform_return_rate'] >= platform_return_rate)]
+    # h_recall_df = df[(df['lastonehour_return'] >= return_count) & (df['score'] >= score_value)
+    #                  & (df['platform_return_rate'] >= platform_return_rate)]
+    # h_recall_df = df[
+    #     (df['lastonehour_return'] >= return_count) &
+    #     (df['score'] >= score_value) &
+    #     (df['platform_return_rate'] >= platform_return_rate)
+    #     ]
+    h_recall_df = df[
+        (df['lastonehour_allreturn'] > 0)
+        ]
+    # try:
+    #     if "return_countv2" in param.keys() and "platform_return_ratev2" in param.keys():
+    #         return_countv2 = param["return_countv2"]
+    #         platform_return_ratev2 = param["platform_return_ratev2"]
+    #         h_recall_df = h_recall_df[
+    #             df['platform_return_rate'] >= platform_return_ratev2 |
+    #             (df['platform_return_rate'] < platform_return_ratev2 & df['lastonehour_return'] > return_countv2)
+    #             ]
+    # except Exception as e:
+    #     log_.error("return_countv2 is wrong with{}".format(e))
 
     # videoid重复时,保留分值高
     h_recall_df = h_recall_df.sort_values(by=['score'], ascending=False)

+ 4 - 2
region_rule_rank_h_v2.py

@@ -720,12 +720,12 @@ def process_with_region(region, df_merged, data_key, rule_key, rule_param, now_d
     log_.info(f"region = {region} start...")
     # 计算score
     region_df = df_merged[df_merged['code'] == region]
-    log_.info(f'region = {region}, region_df count = {len(region_df)}')
+    log_.info(f'该区域region = {region}, 下有多少数据量 = {len(region_df)}')
     score_df = cal_score(df=region_df, param=rule_param)
     video_rank(df=score_df, now_date=now_date, now_h=now_h, rule_key=rule_key, param=rule_param,
                region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag,
                add_videos_with_pre_h=add_videos_with_pre_h, hour_count=hour_count)
-    log_.info(f"region = {region} end!")
+    log_.info(f"多协程的region = {region} 完成执行")
 
 
 def process_with_region2(region, df_merged, data_key, rule_key, rule_param, now_date, now_h,
@@ -810,6 +810,8 @@ def process_with_param(param, data_params_item, rule_params_item, region_code_li
     rule_param = rule_params_item.get(rule_key)
     log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
     merge_func = rule_param.get('merge_func', None)
+    log_.info("数据采用:{},统计采用{}.".format(data_key, rule_key))
+    log_.info("具体的规则是:{}.".format(rule_param))
     # 是否在地域小时级数据中增加打捞的优质视频
     add_videos_with_pre_h = rule_param.get('add_videos_with_pre_h', False)
     hour_count = rule_param.get('hour_count', 0)