Преглед на файлове

Update alg_growth: remove unsafe videos

StrayWarrior преди 4 месеца
родител
ревизия
16ccb1c1fe
променени са 2 файла, в които са добавени 14 реда и са изтрити 0 реда
  1. +9 -0
      alg_growth_3rd_gh_reply_video_v1.py
  2. +5 -0
      alg_growth_gh_reply_video_v1.py

+ 9 - 0
alg_growth_3rd_gh_reply_video_v1.py

@@ -41,6 +41,9 @@ RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
 STATS_PERIOD_DAYS = 5
 SEND_N = 1
 
+unsafe_videos = [13817005, 14403867]
+unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
+
 def get_and_update_gh_ids(run_dt):
     db = MysqlHelper(CONFIG.MYSQL_GROWTH_INFO)
     gh_type = AutoReplyAccountType.EXTERNAL_GZH.value
@@ -95,6 +98,7 @@ def process_reply_stats(project, table, period, run_dt):
 
     df['video_id'] = df['video_id'].astype('int64')
     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
+    df = df.query(f'video_id not in ({unsafe_video_condition})')
 
     # 账号内聚合
     df = df.groupby(['video_id', 'gh_id']).agg({
@@ -124,6 +128,8 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh):
     # 确保重跑时可获得一致结果
     dt_version = f'{run_dt}{run_hour}'
     np.random.seed(int(dt_version) + 1)
+    df = df.query(f'video_id not in ({unsafe_video_condition})')
+    print(df)
 
     # TODO: 修改权重计算策略
     df['score'] = 1.0
@@ -272,6 +278,9 @@ def postprocess_override_by_config(df, gh_df, dt_version):
     override_data['score'] = [0.0] * n_records
     df_to_append = pd.DataFrame(override_data)
     df = pd.concat([df, df_to_append], ignore_index=True)
+    # 强制更换不安全视频
+    idx = df[df['video_id'] == 14403867].index
+    df.loc[idx, 'video_id'] = 20463342
     return df
 
 def build_and_transfer_base_mode(gh_df, run_dt, run_hour, dt_version, dry_run):

+ 5 - 0
alg_growth_gh_reply_video_v1.py

@@ -38,6 +38,9 @@ STATS_PERIOD_DAYS = 5
 STATS_PERIOD_DAYS_FOR_QUIT = 30
 SEND_N = 2
 
+unsafe_videos = [13817005, 14403867]
+unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
+
 pd.set_option('display.max_rows', None)
 
 def get_and_update_gh_ids(run_dt):
@@ -90,6 +93,7 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_
 
     df['video_id'] = df['video_id'].astype('int64')
     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
+    df = df.query(f'video_id not in ({unsafe_video_condition})')
 
     # 账号内聚合
     df = df.groupby(['video_id', 'gh_id']).agg({
@@ -143,6 +147,7 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh_df):
     df = df.drop(filter_rows.index)
     print("low-efficient video to quit:")
     print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']])
+    df = df.query(f'video_id not in ({unsafe_video_condition})')
 
     # 确保重跑时可获得一致结果
     dt_version = f'{run_dt}{run_hour}'