|
@@ -18,6 +18,7 @@ from log import Log
|
|
|
import os
|
|
|
from argparse import ArgumentParser
|
|
|
from constants import AutoReplyAccountType
|
|
|
+from alg_growth_common import check_unsafe_video, filter_unsafe_video
|
|
|
|
|
|
CONFIG, _ = set_config()
|
|
|
LOGGER = Log()
|
|
@@ -41,9 +42,6 @@ RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
|
|
|
STATS_PERIOD_DAYS = 5
|
|
|
SEND_N = 1
|
|
|
|
|
|
-unsafe_videos = [13817005, 14403867]
|
|
|
-unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
|
|
|
-
|
|
|
def get_and_update_gh_ids(run_dt):
|
|
|
db = MysqlHelper(CONFIG.MYSQL_GROWTH_INFO)
|
|
|
gh_type = AutoReplyAccountType.EXTERNAL_GZH.value
|
|
@@ -98,7 +96,9 @@ def process_reply_stats(project, table, period, run_dt):
|
|
|
|
|
|
df['video_id'] = df['video_id'].astype('int64')
|
|
|
df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
|
|
|
- df = df.query(f'video_id not in ({unsafe_video_condition})')
|
|
|
+
|
|
|
+ # 获取统计数据时统一去除不安全视频
|
|
|
+ df = filter_unsafe_video(df)
|
|
|
|
|
|
# 账号内聚合
|
|
|
df = df.groupby(['video_id', 'gh_id']).agg({
|
|
@@ -125,11 +125,10 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh):
|
|
|
# TODO: 加审核&退场
|
|
|
df = get_odps_df_of_max_partition(project, table, {'dt': run_dt})
|
|
|
df = df.to_pandas()
|
|
|
+ df = filter_unsafe_video(df)
|
|
|
# 确保重跑时可获得一致结果
|
|
|
dt_version = f'{run_dt}{run_hour}'
|
|
|
np.random.seed(int(dt_version) + 1)
|
|
|
- df = df.query(f'video_id not in ({unsafe_video_condition})')
|
|
|
- print(df)
|
|
|
|
|
|
# TODO: 修改权重计算策略
|
|
|
df['score'] = 1.0
|
|
@@ -180,7 +179,7 @@ def rank_for_layer2(run_dt, run_hour, project, stats_table, rank_table):
|
|
|
LOGGER.warning(
|
|
|
"gh_id[{}] rows[{}] not enough for layer2, fallback to base"
|
|
|
.format(gh_id, len(sub_df)))
|
|
|
- sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"')
|
|
|
+ sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"').copy()
|
|
|
sub_df['score'] = sub_df['sort']
|
|
|
sampled_df = sub_df.sample(n=SEND_N, weights=sub_df['score'])
|
|
|
sampled_df['sort'] = range(1, len(sampled_df) + 1)
|
|
@@ -237,6 +236,7 @@ def rank_for_base(run_dt, run_hour, project, stats_table, rank_table, stg_key):
|
|
|
|
|
|
|
|
|
def check_result_data(df):
|
|
|
+ check_unsafe_video(df)
|
|
|
for gh_id in GH_IDS:
|
|
|
for key in (EXPLORE1_GROUP_NAME, EXPLORE2_GROUP_NAME, BASE_GROUP_NAME):
|
|
|
sub_df = df.query(f'gh_id == "{gh_id}" and strategy_key == "{key}"')
|