|
@@ -35,6 +35,7 @@ ODPS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
|
|
|
RDS_RANK_RESULT_TABLE = 'alg_gh_autoreply_video_rank_data'
|
|
|
GH_DETAIL = 'gh_detail'
|
|
|
STATS_PERIOD_DAYS = 5
|
|
|
+STATS_PERIOD_DAYS_FOR_QUIT = 30
|
|
|
SEND_N = 2
|
|
|
|
|
|
pd.set_option('display.max_rows', None)
|
|
@@ -111,9 +112,37 @@ def process_reply_stats(project, daily_table, hourly_table, period, run_dt, run_
|
|
|
|
|
|
|
|
|
def rank_for_layer1(run_dt, run_hour, project, table, gh_df):
|
|
|
-
|
|
|
+
|
|
|
df = get_odps_df_of_max_partition(project, table, {'dt': run_dt})
|
|
|
df = df.to_pandas()
|
|
|
+
|
|
|
+
|
|
|
+ stats_df = get_odps_df_of_recent_partitions(
|
|
|
+ ODS_PROJECT, GH_REPLY_STATS_TABLE, 30, {'dt': run_dt}).to_pandas()
|
|
|
+ stats_df['video_id'] = stats_df['video_id'].astype('int64')
|
|
|
+ stats_df = stats_df[['video_id', 'send_count', 'first_visit_uv', 'day0_return']]
|
|
|
+ stats_df = stats_df.groupby(['video_id']).agg({
|
|
|
+ 'send_count': 'sum',
|
|
|
+ 'first_visit_uv': 'sum',
|
|
|
+ 'day0_return': 'sum'
|
|
|
+ })
|
|
|
+
|
|
|
+ stats_df['return_by_send'] = stats_df['day0_return'] / (stats_df['send_count'])
|
|
|
+ stats_df['open_rate'] = stats_df['first_visit_uv'] / (stats_df['send_count'])
|
|
|
+
|
|
|
+ stats_df = stats_df.query('send_count > 1000')
|
|
|
+
|
|
|
+ df = df.merge(stats_df, on='video_id', how='left')
|
|
|
+ open_rate_threshold = df.open_rate.quantile(q=0.2)
|
|
|
+ return_by_send_threshold = df.return_by_send.quantile(q=0.2)
|
|
|
+
|
|
|
+ filter_condition = 'open_rate < {} and return_by_send < {}' \
|
|
|
+ .format(open_rate_threshold, return_by_send_threshold)
|
|
|
+ filter_rows = df.query(filter_condition)
|
|
|
+ df = df.drop(filter_rows.index)
|
|
|
+ print("low-efficient video to quit:")
|
|
|
+ print(filter_rows[['video_id', 'title', 'send_count', 'open_rate', 'return_by_send']])
|
|
|
+
|
|
|
|
|
|
dt_version = f'{run_dt}{run_hour}'
|
|
|
np.random.seed(int(dt_version) + 1)
|