|
@@ -31,6 +31,7 @@ GH_REPLY_STATS_TABLE = 'alg_growth_gh_reply_video_stats'
|
|
|
ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
|
|
|
RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
|
|
|
STATS_PERIOD_DAYS = 3
|
|
|
+SEND_N = 2
|
|
|
|
|
|
def check_data_partition(project, table, data_dt, data_hr=None):
|
|
|
"""检查数据是否准备好"""
|
|
@@ -85,7 +86,7 @@ def rank_for_layer1(run_dt, run_hour, project, table):
|
|
|
# TODO: 修改权重计算策略
|
|
|
sample_weights = df['rov']
|
|
|
|
|
|
- sampled_df = df.sample(n=2, weights=sample_weights)
|
|
|
+ sampled_df = df.sample(n=SEND_N, weights=sample_weights)
|
|
|
sampled_df['sort'] = range(1, len(sampled_df) + 1)
|
|
|
sampled_df['strategy_key'] = EXPLORE1_GROUP_NAME
|
|
|
sampled_df['dt_version'] = dt_version
|
|
@@ -113,7 +114,7 @@ def rank_for_layer2(run_dt, run_hour, project, table):
|
|
|
sampled_dfs = []
|
|
|
# 处理default逻辑(default-explore2)
|
|
|
default_stats_df = stats_df.query('gh_id == "default"')
|
|
|
- sampled_df = default_stats_df.sample(n=2, weights=default_stats_df['score'])
|
|
|
+ sampled_df = default_stats_df.sample(n=SEND_N, weights=default_stats_df['score'])
|
|
|
sampled_df['sort'] = range(1, len(sampled_df) + 1)
|
|
|
sampled_dfs.append(sampled_df)
|
|
|
|
|
@@ -123,10 +124,10 @@ def rank_for_layer2(run_dt, run_hour, project, table):
|
|
|
# TODO: 个数不足时的兜底逻辑
|
|
|
for gh_id in GH_IDS:
|
|
|
sub_df = df.query(f'gh_id == "{gh_id}"')
|
|
|
- sampled_df = sub_df.sample(n=2, weights=sub_df['score'])
|
|
|
+ sampled_df = sub_df.sample(n=SEND_N, weights=sub_df['score'])
|
|
|
sampled_df['sort'] = range(1, len(sampled_df) + 1)
|
|
|
sampled_dfs.append(sampled_df)
|
|
|
- if len(sampled_df) != 2:
|
|
|
+ if len(sampled_df) != SEND_N:
|
|
|
raise
|
|
|
|
|
|
extend_df = pd.concat(sampled_dfs)
|
|
@@ -170,7 +171,7 @@ def rank_for_base(run_dt, run_hour, project, stats_table, rank_table):
|
|
|
top_n = group_sorted.head(n)
|
|
|
top_n['sort'] = range(1, n + 1)
|
|
|
return top_n
|
|
|
- ranked_df = grouped_stats_df.groupby('gh_id').apply(set_top_n)
|
|
|
+ ranked_df = grouped_stats_df.groupby('gh_id').apply(set_top_n, SEND_N)
|
|
|
ranked_df = ranked_df.reset_index(drop=True)
|
|
|
#ranked_df['sort'] = grouped_stats_df.groupby('gh_id')['score'].rank(ascending=False)
|
|
|
ranked_df['strategy_key'] = BASE_GROUP_NAME
|
|
@@ -199,6 +200,9 @@ def build_and_transfer_data(run_dt, run_hour, project):
|
|
|
final_df = final_df.to_pandas()
|
|
|
final_df = final_df[['strategy_key', 'dt_version', 'gh_id', 'sort', 'video_id', 'title', 'cover_url']]
|
|
|
|
|
|
+ # reverse sending order: a row with sort value k gets SEND_N + 1 - k
|
|
|
+ final_df['sort'] = SEND_N + 1 - final_df['sort']
|
|
|
+
|
|
|
# save to ODPS
|
|
|
t = odps_instance.get_table(ODPS_RANK_RESULT_TABLE)
|
|
|
part_spec_dict = {'dt': run_dt, 'hour': run_hour, 'ctime': dt_version}
|