123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- import sys
- import traceback
- import math
- import pandas as pd
- from utils import send_msg_to_feishu
- from config import set_config
- from log import Log
# Module-level singletons used by the script entry point below: the project
# configuration (the second value returned by set_config() is discarded here)
# and the project logger.
config_, _ = set_config()
log_ = Log()

# Column names of the hourly input CSV, in file order. data_group() maps the
# comma-separated fields of each row onto these names by position, so the
# order of this list must match the column order of the file.
features = [
    # Key columns: not summed by data_group(); 'videoid' is the grouping key.
    'apptype',
    'code',
    'videoid',
    # Last-one-hour counters.
    'lastonehour_preview',
    'lastonehour_view',
    'lastonehour_play',
    'lastonehour_share',
    'lastonehour_return',
    # Last-one-hour "*_total" counters.
    'lastonehour_preview_total',
    'lastonehour_view_total',
    'lastonehour_play_total',
    'lastonehour_share_total',
    'platform_return',
    'lastonehour_show',
    'lastonehour_show_region',
    # Last-two-hour counters.
    'lasttwohour_share',
    'lasttwohour_return_now',
    'lasttwohour_return',
    # Last-three-hour counters.
    'lastthreehour_share',
    'lastthreehour_return_now',
    'lastthreehour_return',
    # "*_new" variants of the return counters.
    'lastonehour_return_new',
    'lasttwohour_return_now_new',
    'lasttwohour_return_new',
    'lastthreehour_return_now_new',
    'lastthreehour_return_new',
    'platform_return_new',
]
def data_group(data_path, feature_names=None):
    """Aggregate the hourly CSV by video id, summing every metric column.

    The first line of the file is treated as a header and skipped. Every
    other line is split on "," and its fields are mapped by position onto
    the feature names. Lines with fewer fields than there are feature names
    (including blank lines) are ignored. Metric values are parsed with
    ``int(float(value))`` -- so "3.0" and "3.9" both count as 3 -- and are
    summed over all rows that share the same video id.

    Args:
        data_path: Path of the CSV file to read.
        feature_names: Ordered column names of the CSV. When omitted, the
            module-level ``features`` list is used. Must contain
            ``'videoid'``; the columns ``'apptype'``, ``'code'`` and
            ``'videoid'`` are never summed.

    Returns:
        A ``pandas.DataFrame`` with one row per video id (in first-seen
        order): a ``videoid`` column holding the raw string from the file,
        followed by one summed integer column per metric. If no data row is
        read, the frame is empty and has no columns.

    Raises:
        ValueError: If a metric field cannot be parsed as a number, or if
            ``'videoid'`` is missing from the feature names.
    """
    if feature_names is None:
        feature_names = features

    key_columns = ('apptype', 'code', 'videoid')
    video_id_index = feature_names.index('videoid')
    # (field position, column name) for every column that has to be summed.
    metric_columns = [
        (position, name)
        for position, name in enumerate(feature_names)
        if name not in key_columns
    ]
    expected_fields = len(feature_names)

    totals = {}
    # The context manager guarantees the file is closed even when a
    # malformed number makes the parsing below raise.
    with open(data_path) as data_file:
        next(data_file, None)  # skip the header line (no-op on an empty file)
        for line in data_file:
            items = line.strip().split(",")
            if len(items) < expected_fields:
                continue
            video_id = items[video_id_index]
            row = totals.get(video_id)
            if row is None:
                # 'videoid' is inserted first so it becomes the first column.
                row = totals[video_id] = {'videoid': video_id}
            for position, name in metric_columns:
                row[name] = row.get(name, 0) + int(float(items[position]))

    return pd.DataFrame(list(totals.values()))
def cal_score(data_df):
    """Compute the six hourly ranking scores for every video.

    All formulas add small constants to numerators/denominators (visible
    below) so that videos with very few views or shares do not produce
    extreme ratios.

    Args:
        data_df: DataFrame with one row per video containing at least the
            columns ``videoid``, ``lastonehour_view``, ``lastonehour_share``,
            ``lastonehour_return``, ``lasttwohour_share``,
            ``lasttwohour_return_now``, ``lasttwohour_return``,
            ``lastthreehour_share``, ``lastthreehour_return_now`` and
            ``lastthreehour_return``. It is not modified.

    Returns:
        A new DataFrame with the columns ``videoid`` and ``hour_score1`` ..
        ``hour_score6``, sharing the index of ``data_df``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``lastonehour_return + 1`` is not positive for some
            row (``math.log`` rejects it).
    """
    one_hour_view = data_df['lastonehour_view']
    one_hour_share = data_df['lastonehour_share']
    one_hour_return = data_df['lastonehour_return']
    two_hour_return_now = data_df['lasttwohour_return_now']
    three_hour_return_now = data_df['lastthreehour_return_now']

    # Shares per view and returns per share for the 1h / 2h / 3h windows.
    share_rate_view = (one_hour_share + 1) / (one_hour_view + 1000)
    back_rate = (one_hour_return + 1) / (one_hour_share + 10)
    back_rate_2h = (two_hour_return_now + 1) / (data_df['lasttwohour_share'] + 10)
    back_rate_3h = (three_hour_return_now + 1) / (data_df['lastthreehour_share'] + 10)
    # math.log is applied per element on purpose: it raises on non-positive
    # input instead of silently producing NaN/-inf.
    log_back = (one_hour_return + 1).apply(math.log)

    # Return retention of the 2h / 3h windows, capped at 1 and 0.8.
    return_retention_2h = (
        (two_hour_return_now + 1) / (data_df['lasttwohour_return'] + 5)
    ).clip(upper=1)
    return_retention_3h = (
        (three_hour_return_now + 1) / (data_df['lastthreehour_return'] + 10)
    ).clip(upper=0.8)

    hour_score1 = one_hour_return / (one_hour_view + 5)

    # Start from a copy of the id column so the caller's frame is untouched.
    score_df = data_df[['videoid']].copy()
    score_df['hour_score1'] = hour_score1
    score_df['hour_score2'] = (
        one_hour_return * (1 + return_retention_2h + return_retention_3h)
    ) / (one_hour_view + 1000)
    score_df['hour_score3'] = (
        three_hour_return_now + two_hour_return_now + one_hour_return
    ) / (one_hour_view + 1000)
    score_df['hour_score4'] = share_rate_view * back_rate * log_back
    score_df['hour_score5'] = (
        share_rate_view * (back_rate + back_rate_2h + back_rate_3h) * log_back
    )
    # Same ratio as hour_score1, additionally weighted by the 1h back rate.
    score_df['hour_score6'] = hour_score1 * back_rate
    return score_df
if __name__ == "__main__":
    # Script entry point: aggregate the hourly CSV for the date given as the
    # first command-line argument, compute the scores and write them next to
    # the input file. Any failure is logged and reported to the Feishu robot
    # instead of being re-raised.
    try:
        now_date = sys.argv[1]
        log_.info(f"now_date: {now_date}")

        # Input and output files both live under ./data and are keyed by date.
        data_path = f"./data/hour_video_data_{now_date}.csv"
        hour_score_path = f"./data/hour_score_{now_date}.csv"

        data_df = data_group(data_path=data_path)
        log_.info(f"hour data_df shape: {data_df.shape}")

        score_df = cal_score(data_df=data_df)
        score_df.to_csv(hour_score_path, index=False)
        log_.info(f"hour score_df shape: {score_df.shape}")
    except Exception as e:
        # Capture the traceback text once; it goes to both the log and the alert.
        error_trace = traceback.format_exc()
        log_.error(f"rank 小时级分值更新失败, exception: {e}, traceback: {error_trace}")
        server_robot = config_.FEISHU_ROBOT['server_robot']
        send_msg_to_feishu(
            webhook=server_robot.get('webhook'),
            key_word=server_robot.get('key_word'),
            msg_text=f"rov-offline{config_.ENV_TEXT} - rank 小时级分值更新失败\n"
                     f"exception: {e}\n"
                     f"traceback: {error_trace}"
        )
|