Browse Source

Update gh reply: remove unsafe video

StrayWarrior 4 months ago
parent
commit
ca54f93587
2 changed files with 13 additions and 12 deletions
  1. +7 -7
      alg_growth_3rd_gh_reply_video_v1.py
  2. +6 -5
      alg_growth_gh_reply_video_v1.py

+ 7 - 7
alg_growth_3rd_gh_reply_video_v1.py

@@ -18,6 +18,7 @@ from log import Log
 import os
 from argparse import ArgumentParser
 from constants import AutoReplyAccountType
+from alg_growth_common import check_unsafe_video, filter_unsafe_video
 
 CONFIG, _ = set_config()
 LOGGER = Log()
@@ -41,9 +42,6 @@ RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
 STATS_PERIOD_DAYS = 5
 SEND_N = 1
 
-unsafe_videos = [13817005, 14403867]
-unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
-
 def get_and_update_gh_ids(run_dt):
     db = MysqlHelper(CONFIG.MYSQL_GROWTH_INFO)
     gh_type = AutoReplyAccountType.EXTERNAL_GZH.value
@@ -98,7 +96,9 @@ def process_reply_stats(project, table, period, run_dt):
 
     df['video_id'] = df['video_id'].astype('int64')
     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
-    df = df.query(f'video_id not in ({unsafe_video_condition})')
+
+    # 获取统计数据时统一去除不安全视频
+    df = filter_unsafe_video(df)
 
     # 账号内聚合
     df = df.groupby(['video_id', 'gh_id']).agg({
@@ -125,11 +125,10 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh):
     # TODO: 加审核&退场
     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt})
     df = df.to_pandas()
+    df = filter_unsafe_video(df)
     # 确保重跑时可获得一致结果
     dt_version = f'{run_dt}{run_hour}'
     np.random.seed(int(dt_version) + 1)
-    df = df.query(f'video_id not in ({unsafe_video_condition})')
-    print(df)
 
     # TODO: 修改权重计算策略
     df['score'] = 1.0
@@ -180,7 +179,7 @@ def rank_for_layer2(run_dt, run_hour, project, stats_table, rank_table):
             LOGGER.warning(
                 "gh_id[{}] rows[{}] not enough for layer2, fallback to base"
                 .format(gh_id, len(sub_df)))
-            sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"')
+            sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"').copy()
             sub_df['score'] = sub_df['sort']
         sampled_df = sub_df.sample(n=SEND_N, weights=sub_df['score'])
         sampled_df['sort'] = range(1, len(sampled_df) + 1)
@@ -237,6 +236,7 @@ def rank_for_base(run_dt, run_hour, project, stats_table, rank_table, stg_key):
 
 
 def check_result_data(df):
+    check_unsafe_video(df)
     for gh_id in GH_IDS:
         for key in (EXPLORE1_GROUP_NAME, EXPLORE2_GROUP_NAME, BASE_GROUP_NAME):
             sub_df = df.query(f'gh_id == "{gh_id}" and strategy_key == "{key}"')

+ 6 - 5
alg_growth_gh_reply_video_v1.py

@@ -16,6 +16,7 @@ from log import Log
 import os
 from argparse import ArgumentParser
 from constants import AutoReplyAccountType
+from alg_growth_common import check_unsafe_video, filter_unsafe_video
 
 CONFIG, _ = set_config()
 LOGGER = Log()
@@ -38,9 +39,6 @@ STATS_PERIOD_DAYS = 5
 STATS_PERIOD_DAYS_FOR_QUIT = 30
 SEND_N = 2
 
-unsafe_videos = [13817005, 14403867]
-unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
-
 pd.set_option('display.max_rows', None)
 
 def get_and_update_gh_ids(run_dt):
@@ -93,7 +91,9 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_
 
     df['video_id'] = df['video_id'].astype('int64')
     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
-    df = df.query(f'video_id not in ({unsafe_video_condition})')
+
+    # 获取统计数据时统一去除不安全视频
+    df = filter_unsafe_video(df)
 
     # 账号内聚合
     df = df.groupby(['video_id', 'gh_id']).agg({
@@ -147,7 +147,7 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh_df):
     df = df.drop(filter_rows.index)
     print("low-efficient video to quit:")
     print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']])
-    df = df.query(f'video_id not in ({unsafe_video_condition})')
+    df = filter_unsafe_video(df)
 
     # 确保重跑时可获得一致结果
     dt_version = f'{run_dt}{run_hour}'
@@ -257,6 +257,7 @@ def rank_for_base(run_dt, run_hour, rank_table):
 
 
 def check_result_data(df):
+    check_unsafe_video(df, False)
     for gh_id in GH_IDS:
         for key in (EXPLORE1_GROUP_NAME, EXPLORE2_GROUP_NAME, BASE_GROUP_NAME):
             sub_df = df.query(f'gh_id == "{gh_id}" and strategy_key == "{key}"')