7 月之前 · ca54f93587
--- a/alg_growth_3rd_gh_reply_video_v1.py
+++ b/alg_growth_3rd_gh_reply_video_v1.py
@@ -18,6 +18,7 @@ from log import Log
 
				 import os
			
 
				 from argparse import ArgumentParser
			
 
				 from constants import AutoReplyAccountType
			
 
				+from alg_growth_common import check_unsafe_video, filter_unsafe_video
			
 
				 
			
 
				 CONFIG, _ = set_config()
			
 
				 LOGGER = Log()
			
@@ -41,9 +42,6 @@ RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
 
				 STATS_PERIOD_DAYS = 5
			
 
				 SEND_N = 1
			
 
				 
			
 
				-unsafe_videos = [13817005, 14403867]
			
 
				-unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
			
 
				-
			
 
				 def get_and_update_gh_ids(run_dt):
			
 
				     db = MysqlHelper(CONFIG.MYSQL_GROWTH_INFO)
			
 
				     gh_type = AutoReplyAccountType.EXTERNAL_GZH.value
			
@@ -98,7 +96,9 @@ def process_reply_stats(project, table, period, run_dt):
 
				 
			
 
				     df['video_id'] = df['video_id'].astype('int64')
			
 
				     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
			
 
				-    df = df.query(f'video_id not in ({unsafe_video_condition})')
			
 
				+
			
 
				+    # 获取统计数据时统一去除不安全视频
			
 
				+    df = filter_unsafe_video(df)
			
 
				 
			
 
				     # 账号内聚合
			
 
				     df = df.groupby(['video_id', 'gh_id']).agg({
			
@@ -125,11 +125,10 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh):
 
				     # TODO: 加审核&退场
			
 
				     df = get_odps_df_of_max_partition(project, table, {'dt': run_dt})
			
 
				     df = df.to_pandas()
			
 
				+    df = filter_unsafe_video(df)
			
 
				     # 确保重跑时可获得一致结果
			
 
				     dt_version = f'{run_dt}{run_hour}'
			
 
				     np.random.seed(int(dt_version) + 1)
			
 
				-    df = df.query(f'video_id not in ({unsafe_video_condition})')
			
 
				-    print(df)
			
 
				 
			
 
				     # TODO: 修改权重计算策略
			
 
				     df['score'] = 1.0
			
@@ -180,7 +179,7 @@ def rank_for_layer2(run_dt, run_hour, project, stats_table, rank_table):
 
				             LOGGER.warning(
			
 
				                 "gh_id[{}] rows[{}] not enough for layer2, fallback to base"
			
 
				                 .format(gh_id, len(sub_df)))
			
 
				-            sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"')
			
 
				+            sub_df = base_strategy_df.query(f'gh_id == "{gh_id}"').copy()
			
 
				             sub_df['score'] = sub_df['sort']
			
 
				         sampled_df = sub_df.sample(n=SEND_N, weights=sub_df['score'])
			
 
				         sampled_df['sort'] = range(1, len(sampled_df) + 1)
			
@@ -237,6 +236,7 @@ def rank_for_base(run_dt, run_hour, project, stats_table, rank_table, stg_key):
 
				 
			
 
				 
			
 
				 def check_result_data(df):
			
 
				+    check_unsafe_video(df)
			
 
				     for gh_id in GH_IDS:
			
 
				         for key in (EXPLORE1_GROUP_NAME, EXPLORE2_GROUP_NAME, BASE_GROUP_NAME):
			
 
				             sub_df = df.query(f'gh_id == "{gh_id}" and strategy_key == "{key}"')
			
--- a/alg_growth_gh_reply_video_v1.py
+++ b/alg_growth_gh_reply_video_v1.py
@@ -16,6 +16,7 @@ from log import Log
 
				 import os
			
 
				 from argparse import ArgumentParser
			
 
				 from constants import AutoReplyAccountType
			
 
				+from alg_growth_common import check_unsafe_video, filter_unsafe_video
			
 
				 
			
 
				 CONFIG, _ = set_config()
			
 
				 LOGGER = Log()
			
@@ -38,9 +39,6 @@ STATS_PERIOD_DAYS = 5
 
				 STATS_PERIOD_DAYS_FOR_QUIT = 30
			
 
				 SEND_N = 2
			
 
				 
			
 
				-unsafe_videos = [13817005, 14403867]
			
 
				-unsafe_video_condition = ','.join([str(x) for x in unsafe_videos])
			
 
				-
			
 
				 pd.set_option('display.max_rows', None)
			
 
				 
			
 
				 def get_and_update_gh_ids(run_dt):
			
@@ -93,7 +91,9 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_
 
				 
			
 
				     df['video_id'] = df['video_id'].astype('int64')
			
 
				     df = df[['gh_id', 'video_id', 'send_count', 'first_visit_uv', 'day0_return']]
			
 
				-    df = df.query(f'video_id not in ({unsafe_video_condition})')
			
 
				+
			
 
				+    # 获取统计数据时统一去除不安全视频
			
 
				+    df = filter_unsafe_video(df)
			
 
				 
			
 
				     # 账号内聚合
			
 
				     df = df.groupby(['video_id', 'gh_id']).agg({
			
@@ -147,7 +147,7 @@ def rank_for_layer1(run_dt, run_hour, project, table, gh_df):
 
				     df = df.drop(filter_rows.index)
			
 
				     print("low-efficient video to quit:")
			
 
				     print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']])
			
 
				-    df = df.query(f'video_id not in ({unsafe_video_condition})')
			
 
				+    df = filter_unsafe_video(df)
			
 
				 
			
 
				     # 确保重跑时可获得一致结果
			
 
				     dt_version = f'{run_dt}{run_hour}'
			
@@ -257,6 +257,7 @@ def rank_for_base(run_dt, run_hour, rank_table):
 
				 
			
 
				 
			
 
				 def check_result_data(df):
			
 
				+    check_unsafe_video(df, False)
			
 
				     for gh_id in GH_IDS:
			
 
				         for key in (EXPLORE1_GROUP_NAME, EXPLORE2_GROUP_NAME, BASE_GROUP_NAME):
			
 
				             sub_df = df.query(f'gh_id == "{gh_id}" and strategy_key == "{key}"')