# coding: utf-8
"""Compute hour-level ranking scores per video from the hourly feature CSV.

Usage: the script takes one command-line argument (``now_date``) which is
interpolated into the input path ``./data/hour_video_data_{now_date}.csv``
and the output path ``./data/hour_score_{now_date}.csv``.
"""
import sys
import traceback
import math
import pandas as pd
from my_utils import send_msg_to_feishu
from my_config import set_config
from log import Log

config_, _ = set_config()
log_ = Log()

# Column order of the input CSV. The first three columns are identifiers;
# every other column is a numeric counter that is summed per videoid.
features = [
    'apptype',
    'code',
    'videoid',
    'lastonehour_preview',  # users pre-exposed in the past hour
    'lastonehour_view',  # users exposed in the past hour
    'lastonehour_play',  # users who played in the past hour
    'lastonehour_share',  # users who shared in the past hour
    'lastonehour_return',  # shared in the past hour, returning users in the past hour
    'lastonehour_preview_total',  # pre-exposure count in the past hour
    'lastonehour_view_total',  # exposure count in the past hour
    'lastonehour_play_total',  # play count in the past hour
    'lastonehour_share_total',  # share count in the past hour
    'platform_return',
    'lastonehour_show',  # not split by region
    'lastonehour_show_region',  # grouped by region
    'lasttwohour_share',  # users who shared in hour h-2
    'lasttwohour_return_now',  # shared in h-2, returning users in the past hour
    'lasttwohour_return',  # shared in h-2, returning users in h-2
    'lastthreehour_share',  # users who shared in hour h-3
    'lastthreehour_return_now',  # shared in h-3, returning users in the past hour
    'lastthreehour_return',  # shared in h-3, returning users in h-3
    # The "*_new" variants count returns brought back by shares from the
    # corresponding region: shares are region-restricted, returns are not.
    'lastonehour_return_new',  # shared in the past hour, returning users in the past hour
    'lasttwohour_return_now_new',  # shared in h-2, returning users in the past hour
    'lasttwohour_return_new',  # shared in h-2, returning users in h-2
    'lastthreehour_return_now_new',  # shared in h-3, returning users in the past hour
    'lastthreehour_return_new',  # shared in h-3, returning users in h-3
    'platform_return_new',  # platform-distribution returns
]


def data_group(data_path):
    """Aggregate (sum) the CSV rows by videoid.

    The first line of the file is treated as a header and skipped. Rows with
    fewer comma-separated fields than ``features`` are ignored. Every numeric
    field is parsed with ``int(float(...))`` and summed per videoid.

    :param data_path: path of the comma-separated input file.
    :return: ``pd.DataFrame`` with one row per videoid, holding ``videoid``
        plus one summed column per numeric feature.
    :raises ValueError: if a numeric field cannot be parsed.
    """
    id_columns = ('apptype', 'code', 'videoid')
    video_id_index = features.index('videoid')
    # Resolve the (position, name) pairs of the numeric columns once instead
    # of re-testing every feature name for every row.
    numeric_columns = [
        (position, name)
        for position, name in enumerate(features)
        if name not in id_columns
    ]

    data_dict = {}
    # The context manager guarantees the file is closed even when a row
    # fails to parse.
    with open(data_path) as f:
        next(f, None)  # skip the header line
        for line in f:
            items = line.strip().split(",")
            if len(items) < len(features):
                continue
            video_id = items[video_id_index]
            row = data_dict.setdefault(video_id, {'videoid': video_id})
            for position, name in numeric_columns:
                row[name] = row.get(name, 0) + int(float(items[position]))

    return pd.DataFrame(list(data_dict.values()))


def cal_score(data_df):
    """Compute the six hour-level scores for every video.

    :param data_df: aggregated per-video frame as returned by ``data_group``.
    :return: ``pd.DataFrame`` with columns ``videoid`` and
        ``hour_score1`` .. ``hour_score6``.
    """
    df = data_df.copy()

    # share_rate_view = (share + 1) / (view + 1000)
    df['share_rate_view'] = (df['lastonehour_share'] + 1) / (df['lastonehour_view'] + 1000)
    # back_rate = (return + 1) / (share + 10)
    df['back_rate'] = (df['lastonehour_return'] + 1) / (df['lastonehour_share'] + 10)
    # back_rate_2h = (lasttwohour_return_now + 1) / (lasttwohour_share + 10)
    df['back_rate_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_share'] + 10)
    # back_rate_3h = (lastthreehour_return_now + 1) / (lastthreehour_share + 10)
    df['back_rate_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_share'] + 10)
    df['log_back'] = (df['lastonehour_return'] + 1).apply(math.log)

    # h-2 return retention, capped at 1
    df['return_retention_initial_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_return'] + 5)
    df['return_retention_2h'] = df['return_retention_initial_2h'].clip(upper=1)
    # h-3 return retention, capped at 0.8
    df['return_retention_initial_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_return'] + 10)
    df['return_retention_3h'] = df['return_retention_initial_3h'].clip(upper=0.8)

    # score1 = return / (view + 5)
    df['hour_score1'] = df['lastonehour_return'] / (df['lastonehour_view'] + 5)
    # score2 = (return * (1 + h-2 retention + h-3 retention)) / (view + 1000)
    retention_weight = 1 + df['return_retention_2h'] + df['return_retention_3h']
    df['hour_score2'] = (df['lastonehour_return'] * retention_weight) / (df['lastonehour_view'] + 1000)
    # score3 = (lastthreehour_return_now + lasttwohour_return_now + lastonehour_return) / (view + 1000)
    df['hour_score3'] = (
        df['lastthreehour_return_now'] + df['lasttwohour_return_now'] + df['lastonehour_return']
    ) / (df['lastonehour_view'] + 1000)
    # score4 = share_rate_view * back_rate * log_back
    df['hour_score4'] = df['share_rate_view'] * df['back_rate'] * df['log_back']
    # score5 = share_rate_view * (back_rate + back_rate_2h + back_rate_3h) * log_back
    df['hour_score5'] = (
        df['share_rate_view']
        * (df['back_rate'] + df['back_rate_2h'] + df['back_rate_3h'])
        * df['log_back']
    )
    # score6 = return / (view + 5) * back_rate, i.e. score1 * back_rate
    df['hour_score6'] = df['hour_score1'] * df['back_rate']

    score_df = df[['videoid', 'hour_score1', 'hour_score2', 'hour_score3',
                   'hour_score4', 'hour_score5', 'hour_score6']]
    return score_df


if __name__ == "__main__":
    try:
        now_date = sys.argv[1]
        log_.info(f"now_date: {now_date}")
        data_path = f"./data/hour_video_data_{now_date}.csv"
        data_df = data_group(data_path=data_path)
        log_.info(f"hour data_df shape: {data_df.shape}")
        hour_score_path = f"./data/hour_score_{now_date}.csv"
        score_df = cal_score(data_df=data_df)
        score_df.to_csv(hour_score_path, index=False)
        log_.info(f"hour score_df shape: {score_df.shape}")
    except Exception as e:
        # Top-level boundary: log the failure and alert instead of crashing.
        # The traceback is formatted once and reused for both sinks.
        tb_text = traceback.format_exc()
        log_.error(f"rank 小时级分值更新失败, exception: {e}, traceback: {tb_text}")
        send_msg_to_feishu(
            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
            msg_text=f"rov-offline{config_.ENV_TEXT} - rank 小时级分值更新失败\n"
                     f"exception: {e}\n"
                     f"traceback: {tb_text}"
        )