| 
					
				 | 
			
			
				@@ -35,6 +35,7 @@ ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# RDS destination table for the computed video ranking results.
RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'

# Name of the table/partition holding per-account ("gh") detail records.
GH_DETAIL = 'gh_detail'

# Look-back window in days for the reply statistics used in ranking.
# NOTE(review): presumably consumed by the stats-aggregation step — confirm against callers.
STATS_PERIOD_DAYS = 5

# Longer look-back window in days used when deciding which low-efficiency
# videos to quit (remove from rotation).
# NOTE(review): the quit logic below currently passes a literal 30 instead of
# this constant — confirm and unify once the full function is in view.
STATS_PERIOD_DAYS_FOR_QUIT = 30

# assumes this is the number of videos sent per auto-reply — TODO confirm
SEND_N = 2

# Show all rows when printing DataFrames (debug/inspection output, e.g. the
# "low-efficient video to quit" dump below).
pd.set_option('display.max_rows', None)
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -111,9 +112,37 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def rank_for_layer1(run_dt, run_hour, project, table, gh_df): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    # TODO: 加审核&退场 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # TODO: 加审核 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     df = df.to_pandas() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # use statistic data to quit some low-efficiency video 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = get_odps_df_of_recent_partitions( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ODS_PROJECT, GH_REPLY_STATS_TABLE, 30, {'dt': run_dt}).to_pandas() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['video_id'] = stats_df['video_id'].astype('int64') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df[['video_id', 'send_count', 'first_visit_uv', 'day0_return']] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df.groupby(['video_id']).agg({ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'send_count': 'sum', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'first_visit_uv': 'sum', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'day0_return': 'sum' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    }) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # do not add to denominator 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['return_by_send'] = stats_df['day0_return'] / (stats_df['send_count']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df['open_rate'] = stats_df['first_visit_uv'] / (stats_df['send_count']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # do not filter video that does not have enough data 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    stats_df = stats_df.query('send_count > 1000') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    df = df.merge(stats_df, on='video_id', how='left') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    open_rate_threshold = df.open_rate.quantile(q=0.2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return_by_send_threshold = df.return_by_send.quantile(q=0.2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    filter_condition = 'open_rate < {} and return_by_send < {}' \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .format(open_rate_threshold, return_by_send_threshold) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    filter_rows = df.query(filter_condition) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    df = df.drop(filter_rows.index) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("low-efficient video to quit:") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # 确保重跑时可获得一致结果 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     dt_version = f'{run_dt}{run_hour}' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     np.random.seed(int(dt_version) + 1) 
			 |