Browse Source

Update alg_growth_gh_reply_video_v1: quit video in layer1 by stats

StrayWarrior 5 months ago
parent
commit
2cfaec4917
1 changed files with 30 additions and 1 deletions
  1. 30 1
      alg_growth_gh_reply_video_v1.py

+ 30 - 1
alg_growth_gh_reply_video_v1.py

@@ -35,6 +35,7 @@ ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
 RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
 GH_DETAIL = 'gh_detail'
 STATS_PERIOD_DAYS = 5
+STATS_PERIOD_DAYS_FOR_QUIT = 30
 SEND_N = 2
 
 pd.set_option('display.max_rows', None)
@@ -111,9 +112,37 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_
 
 
 def rank_for_layer1(run_dt, run_hour, project, table, gh_df):
-    # TODO: 加审核&退场
+    # TODO: 加审核
     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt})
     df = df.to_pandas()
+
+    # use statistic data to quit some low-efficiency video
+    stats_df = get_odps_df_of_recent_partitions(
+        ODS_PROJECT, GH_REPLY_STATS_TABLE, 30, {'dt': run_dt}).to_pandas()
+    stats_df['video_id'] = stats_df['video_id'].astype('int64')
+    stats_df = stats_df[['video_id', 'send_count', 'first_visit_uv', 'day0_return']]
+    stats_df = stats_df.groupby(['video_id']).agg({
+        'send_count': 'sum',
+        'first_visit_uv': 'sum',
+        'day0_return': 'sum'
+    })
+    # do not add to denominator
+    stats_df['return_by_send'] = stats_df['day0_return'] / (stats_df['send_count'])
+    stats_df['open_rate'] = stats_df['first_visit_uv'] / (stats_df['send_count'])
+    # do not filter video that does not have enough data
+    stats_df = stats_df.query('send_count > 1000')
+
+    df = df.merge(stats_df, on='video_id', how='left')
+    open_rate_threshold = df.open_rate.quantile(q=0.2)
+    return_by_send_threshold = df.return_by_send.quantile(q=0.2)
+
+    filter_condition = 'open_rate < {} and return_by_send < {}' \
+        .format(open_rate_threshold, return_by_send_threshold)
+    filter_rows = df.query(filter_condition)
+    df = df.drop(filter_rows.index)
+    print("low-efficient video to quit:")
+    print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']])
+
     # 确保重跑时可获得一致结果
     dt_version = f'{run_dt}{run_hour}'
     np.random.seed(int(dt_version) + 1)