| 
														
															@@ -35,6 +35,7 @@ ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data' 
														 | 
														
														 | 
														
															 RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 GH_DETAIL = 'gh_detail' 
														 | 
														
														 | 
														
															 GH_DETAIL = 'gh_detail' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 STATS_PERIOD_DAYS = 5 
														 | 
														
														 | 
														
															 STATS_PERIOD_DAYS = 5 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+STATS_PERIOD_DAYS_FOR_QUIT = 30 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 SEND_N = 2 
														 | 
														
														 | 
														
															 SEND_N = 2 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 pd.set_option('display.max_rows', None) 
														 | 
														
														 | 
														
															 pd.set_option('display.max_rows', None) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -111,9 +112,37 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 def rank_for_layer1(run_dt, run_hour, project, table, gh_df): 
														 | 
														
														 | 
														
															 def rank_for_layer1(run_dt, run_hour, project, table, gh_df): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    # TODO: 加审核&退场 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    # TODO: 加审核 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt}) 
														 | 
														
														 | 
														
															     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt}) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     df = df.to_pandas() 
														 | 
														
														 | 
														
															     df = df.to_pandas() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    # use statistic data to quit some low-efficiency video 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df = get_odps_df_of_recent_partitions( 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        ODS_PROJECT, GH_REPLY_STATS_TABLE, 30, {'dt': run_dt}).to_pandas() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df['video_id'] = stats_df['video_id'].astype('int64') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df = stats_df[['video_id', 'send_count', 'first_visit_uv', 'day0_return']] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df = stats_df.groupby(['video_id']).agg({ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        'send_count': 'sum', 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        'first_visit_uv': 'sum', 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        'day0_return': 'sum' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    }) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    # do not add to denominator 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df['return_by_send'] = stats_df['day0_return'] / (stats_df['send_count']) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df['open_rate'] = stats_df['first_visit_uv'] / (stats_df['send_count']) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    # do not filter video that does not have enough data 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    stats_df = stats_df.query('send_count > 1000') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    df = df.merge(stats_df, on='video_id', how='left') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    open_rate_threshold = df.open_rate.quantile(q=0.2) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    return_by_send_threshold = df.return_by_send.quantile(q=0.2) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    filter_condition = 'open_rate < {} and return_by_send < {}' \ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        .format(open_rate_threshold, return_by_send_threshold) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    filter_rows = df.query(filter_condition) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    df = df.drop(filter_rows.index) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print("low-efficient video to quit:") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']]) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # 确保重跑时可获得一致结果 
														 | 
														
														 | 
														
															     # 确保重跑时可获得一致结果 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     dt_version = f'{run_dt}{run_hour}' 
														 | 
														
														 | 
														
															     dt_version = f'{run_dt}{run_hour}' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     np.random.seed(int(dt_version) + 1) 
														 | 
														
														 | 
														
															     np.random.seed(int(dt_version) + 1) 
														 |