| 
					
				 | 
			
			
				@@ -35,6 +35,7 @@ ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# RDS destination table for the computed video ranking results.
RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'

# Name of the table/partition holding per-account ("gh") detail records.
GH_DETAIL = 'gh_detail'

# Look-back window in days for the reply statistics used in ranking.
# NOTE(review): presumably consumed by the stats-aggregation step — confirm against callers.
STATS_PERIOD_DAYS = 5

# Longer look-back window in days used when deciding which low-efficiency
# videos to quit (remove from rotation).
# NOTE(review): the quit logic below currently passes a literal 30 instead of
# this constant — confirm and unify once the full function is in view.
STATS_PERIOD_DAYS_FOR_QUIT = 30

# assumes this is the number of videos sent per auto-reply — TODO confirm
SEND_N = 2

# Show all rows when printing DataFrames (debug/inspection output, e.g. the
# "low-efficient video to quit" dump below).
pd.set_option('display.max_rows', None)
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -111,9 +112,37 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def rank_for_layer1(run_dt, run_hour, project, table, gh_df): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    # TODO: 加审核&退场 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # TODO: 加审核 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     df = df.to_pandas() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # use statistic data to quit some low-efficiency video 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = get_odps_df_of_recent_partitions( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ODS_PROJECT, GH_REPLY_STATS_TABLE, 30, {'dt': run_dt}).to_pandas() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['video_id'] = stats_df['video_id'].astype('int64') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df[['video_id', 'send_count', 'first_visit_uv', 'day0_return']] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df.groupby(['video_id']).agg({ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'send_count': 'sum', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'first_visit_uv': 'sum', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'day0_return': 'sum' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    }) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # do not add to denominator 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['return_by_send'] = stats_df['day0_return'] / (stats_df['send_count']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['open_rate'] = stats_df['first_visit_uv'] / (stats_df['send_count']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # do not filter video that does not have enough data 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df.query('send_count > 1000') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    df = df.merge(stats_df, on='video_id', how='left') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    open_rate_threshold = df.open_rate.quantile(q=0.2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return_by_send_threshold = df.return_by_send.quantile(q=0.2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    filter_condition = 'open_rate < {} and return_by_send < {}' \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .format(open_rate_threshold, return_by_send_threshold) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    filter_rows = df.query(filter_condition) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    df = df.drop(filter_rows.index) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("low-efficient video to quit:") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # 确保重跑时可获得一致结果 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     dt_version = f'{run_dt}{run_hour}' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     np.random.seed(int(dt_version) + 1) 
			 |