hace 1 año · caa03ede09
--- a/alg_growth_gh_reply_video_v1.py
+++ b/alg_growth_gh_reply_video_v1.py
@@ -31,6 +31,7 @@ GH_REPLY_STATS_TABLE = 'alg_growth_gh_reply_video_stats'
 
				 ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
			
 
				 RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
			
 
				 STATS_PERIOD_DAYS = 3
			
 
				+SEND_N = 2
			
 
				 
			
 
				 def check_data_partition(project, table, data_dt, data_hr=None):
			
 
				     """检查数据是否准备好"""
			
@@ -85,7 +86,7 @@ def rank_for_layer1(run_dt, run_hour, project, table):
 
				     # TODO: 修改权重计算策略
			
 
				     sample_weights = df['rov']
			
 
				 
			
 
				-    sampled_df = df.sample(n=2, weights=sample_weights)
			
 
				+    sampled_df = df.sample(n=SEND_N, weights=sample_weights)
			
 
				     sampled_df['sort'] = range(1, len(sampled_df) + 1)
			
 
				     sampled_df['strategy_key'] = EXPLORE1_GROUP_NAME
			
 
				     sampled_df['dt_version'] = dt_version
			
@@ -113,7 +114,7 @@ def rank_for_layer2(run_dt, run_hour, project, table):
 
				     sampled_dfs = []
			
 
				     # 处理default逻辑（default-explore2）
			
 
				     default_stats_df = stats_df.query('gh_id == "default"')
			
 
				-    sampled_df = default_stats_df.sample(n=2, weights=default_stats_df['score'])
			
 
				+    sampled_df = default_stats_df.sample(n=SEND_N, weights=default_stats_df['score'])
			
 
				     sampled_df['sort'] = range(1, len(sampled_df) + 1)
			
 
				     sampled_dfs.append(sampled_df)
			
 
				 
			
@@ -123,10 +124,10 @@ def rank_for_layer2(run_dt, run_hour, project, table):
 
				     # TODO: 个数不足时的兜底逻辑
			
 
				     for gh_id in GH_IDS:
			
 
				         sub_df = df.query(f'gh_id == "{gh_id}"')
			
 
				-        sampled_df = sub_df.sample(n=2, weights=sub_df['score'])
			
 
				+        sampled_df = sub_df.sample(n=SEND_N, weights=sub_df['score'])
			
 
				         sampled_df['sort'] = range(1, len(sampled_df) + 1)
			
 
				         sampled_dfs.append(sampled_df)
			
 
				-        if len(sampled_df) != 2:
			
 
				+        if len(sampled_df) != SEND_N:
			
 
				             raise
			
 
				 
			
 
				     extend_df = pd.concat(sampled_dfs)
			
@@ -170,7 +171,7 @@ def rank_for_base(run_dt, run_hour, project, stats_table, rank_table):
 
				         top_n = group_sorted.head(n)
			
 
				         top_n['sort'] = range(1, n + 1)
			
 
				         return top_n
			
 
				-    ranked_df = grouped_stats_df.groupby('gh_id').apply(set_top_n)
			
 
				+    ranked_df = grouped_stats_df.groupby('gh_id').apply(set_top_n, SEND_N)
			
 
				     ranked_df = ranked_df.reset_index(drop=True)
			
 
				     #ranked_df['sort'] = grouped_stats_df.groupby('gh_id')['score'].rank(ascending=False)
			
 
				     ranked_df['strategy_key'] = BASE_GROUP_NAME
			
@@ -199,6 +200,9 @@ def build_and_transfer_data(run_dt, run_hour, project):
 
				     final_df = final_df.to_pandas()
			
 
				     final_df = final_df[['strategy_key', 'dt_version', 'gh_id', 'sort', 'video_id', 'title', 'cover_url']]
			
 
				 
			
 
				+    # reverse sending order: sort k becomes SEND_N + 1 - k (1 <-> SEND_N)
			
 
				+    final_df['sort'] = SEND_N + 1 - final_df['sort']
			
 
				+
			
 
				     # save to ODPS
			
 
				     t = odps_instance.get_table(ODPS_RANK_RESULT_TABLE)
			
 
				     part_spec_dict = {'dt': run_dt, 'hour': run_hour, 'ctime': dt_version}