|  | @@ -0,0 +1,129 @@
 | 
											
												
													
														|  | 
# coding: utf-8
 | 
											
												
													
														|  | 
 |  | +import sys
 | 
											
												
													
														|  | 
 |  | +import json
 | 
											
												
													
														|  | 
 |  | +import math
 | 
											
												
													
														|  | 
 |  | +import pandas as pd
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
# Column names of the hourly video CSV, in file order.
# Order matters: data_group() reads CSV fields by their position in this list.
features = [
    'apptype',
    'code',
    'videoid',
    'lastonehour_preview',  # users pre-exposed in the past hour
    'lastonehour_view',  # users exposed in the past hour
    'lastonehour_play',  # users who played in the past hour
    'lastonehour_share',  # users who shared in the past hour
    'lastonehour_return',  # shared in the past hour; returning users in the past hour
    'lastonehour_preview_total',  # number of pre-exposures in the past hour
    'lastonehour_view_total',  # number of exposures in the past hour
    'lastonehour_play_total',  # number of plays in the past hour
    'lastonehour_share_total',  # number of shares in the past hour
    'platform_return',
    'lastonehour_show',  # not split by region
    'lastonehour_show_region',  # grouped by region
    'lasttwohour_share',  # users who shared in hour h-2
    'lasttwohour_return_now',  # shared in h-2; returning users in the past hour
    'lasttwohour_return',  # shared in h-2; returning users in h-2
    'lastthreehour_share',  # users who shared in hour h-3
    'lastthreehour_return_now',  # shared in h-3; returning users in the past hour
    'lastthreehour_return',  # shared in h-3; returning users in h-3

    # The *_new columns count the returns brought back by shares from the
    # corresponding region: shares are restricted by region, returns are not.
    'lastonehour_return_new',  # shared in the past hour; returning users in the past hour
    'lasttwohour_return_now_new',  # shared in h-2; returning users in the past hour
    'lasttwohour_return_new',  # shared in h-2; returning users in h-2
    'lastthreehour_return_now_new',  # shared in h-3; returning users in the past hour
    'lastthreehour_return_new',  # shared in h-3; returning users in h-3
    'platform_return_new',  # platform-distribution returns
]
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def data_group(data_path):
 | 
											
												
													
														|  | 
 |  | +    """将数据按照videoid聚合(求和)"""
 | 
											
												
													
														|  | 
 |  | +    f = open(data_path)
 | 
											
												
													
														|  | 
 |  | +    index = 0
 | 
											
												
													
														|  | 
 |  | +    data_dict = {}
 | 
											
												
													
														|  | 
 |  | +    while True:
 | 
											
												
													
														|  | 
 |  | +        line = f.readline()
 | 
											
												
													
														|  | 
 |  | +        if not line:
 | 
											
												
													
														|  | 
 |  | +            break
 | 
											
												
													
														|  | 
 |  | +        if index == 0:
 | 
											
												
													
														|  | 
 |  | +            index += 1
 | 
											
												
													
														|  | 
 |  | +            continue
 | 
											
												
													
														|  | 
 |  | +        index += 1
 | 
											
												
													
														|  | 
 |  | +        items = line.strip().split(",")
 | 
											
												
													
														|  | 
 |  | +        # print(items)
 | 
											
												
													
														|  | 
 |  | +        if len(items) < len(features):
 | 
											
												
													
														|  | 
 |  | +            continue
 | 
											
												
													
														|  | 
 |  | +        video_id = items[2]
 | 
											
												
													
														|  | 
 |  | +        if video_id not in data_dict:
 | 
											
												
													
														|  | 
 |  | +            data_dict[video_id] = {'videoid': video_id}
 | 
											
												
													
														|  | 
 |  | +            for i, feature in enumerate(features):
 | 
											
												
													
														|  | 
 |  | +                if feature in ['apptype', 'code', 'videoid']:
 | 
											
												
													
														|  | 
 |  | +                    continue
 | 
											
												
													
														|  | 
 |  | +                data_dict[video_id][feature] = int(float(items[i]))
 | 
											
												
													
														|  | 
 |  | +        else:
 | 
											
												
													
														|  | 
 |  | +            for i, feature in enumerate(features):
 | 
											
												
													
														|  | 
 |  | +                if feature in ['apptype', 'code', 'videoid']:
 | 
											
												
													
														|  | 
 |  | +                    continue
 | 
											
												
													
														|  | 
 |  | +                data_dict[video_id][feature] = data_dict[video_id][feature] + int(float(items[i]))
 | 
											
												
													
														|  | 
 |  | +    f.close()
 | 
											
												
													
														|  | 
 |  | +    data_list = [item for video_id, item in data_dict.items()]
 | 
											
												
													
														|  | 
 |  | +    data_df = pd.DataFrame(data_list)
 | 
											
												
													
														|  | 
 |  | +    return data_df
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def cal_score(data_df):
 | 
											
												
													
														|  | 
 |  | +    """计算score"""
 | 
											
												
													
														|  | 
 |  | +    df = data_df.copy()
 | 
											
												
													
														|  | 
 |  | +    # share_rate_view = (share+1)/(view+1000)
 | 
											
												
													
														|  | 
 |  | +    df['share_rate_view'] = (df['lastonehour_share'] + 1) / (df['lastonehour_view'] + 1000)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # back_rate = (return+1)/(share+10)
 | 
											
												
													
														|  | 
 |  | +    df['back_rate'] = (df['lastonehour_return'] + 1) / (df['lastonehour_share'] + 10)
 | 
											
												
													
														|  | 
 |  | +    # back_rate_2h = (lasttwohour_return_now+1)/(share+10)
 | 
											
												
													
														|  | 
 |  | +    df['back_rate_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_share'] + 10)
 | 
											
												
													
														|  | 
 |  | +    # back_rate_3h = (lastthreehour_return_now+1)/(share+10)
 | 
											
												
													
														|  | 
 |  | +    df['back_rate_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_share'] + 10)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    df['log_back'] = (df['lastonehour_return'] + 1).apply(math.log)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # h-2回流留存
 | 
											
												
													
														|  | 
 |  | +    df['return_retention_initial_2h'] = (df['lasttwohour_return_now'] + 1) / (df['lasttwohour_return'] + 5)
 | 
											
												
													
														|  | 
 |  | +    df['return_retention_2h'] = df['return_retention_initial_2h'].apply(lambda x: 1 if x > 1 else x)
 | 
											
												
													
														|  | 
 |  | +    # h-3回流留存
 | 
											
												
													
														|  | 
 |  | +    df['return_retention_initial_3h'] = (df['lastthreehour_return_now'] + 1) / (df['lastthreehour_return'] + 10)
 | 
											
												
													
														|  | 
 |  | +    df['return_retention_3h'] = df['return_retention_initial_3h'].apply(lambda x: 0.8 if x > 0.8 else x)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # score1 = 回流/(view+5)
 | 
											
												
													
														|  | 
 |  | +    df['hour_score1'] = df['lastonehour_return'] / (df['lastonehour_view'] + 5)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # score2 = (回流 * (1 + h-2回流留存 + h-3回流留存))/(view+1000)
 | 
											
												
													
														|  | 
 |  | +    df['hour_score2'] = (df['lastonehour_return'] * (1 + df['return_retention_2h'] + df['return_retention_3h'])) / \
 | 
											
												
													
														|  | 
 |  | +                   (df['lastonehour_view'] + 1000)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # score3 = (lastthreehour_return_now + lasttwohour_return_now + lastonehour_return)/(lastonehour_view+1000)
 | 
											
												
													
														|  | 
 |  | +    df['hour_score3'] = (df['lastthreehour_return_now'] + df['lasttwohour_return_now'] + df['lastonehour_return']) / \
 | 
											
												
													
														|  | 
 |  | +                   (df['lastonehour_view'] + 1000)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # score4 = share/view * back_rate * logback
 | 
											
												
													
														|  | 
 |  | +    df['hour_score4'] = df['share_rate_view'] * df['back_rate'] * df['log_back']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # score5 = share/view * (back_rate + back_rate_2h + back_rate_3h) * logback
 | 
											
												
													
														|  | 
 |  | +    df['hour_score5'] = df['share_rate_view'] * (df['back_rate'] + df['back_rate_2h'] + df['back_rate_3h']) * df['log_back']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    score_df = df[['videoid', 'hour_score1', 'hour_score2', 'hour_score3', 'hour_score4', 'hour_score5']]
 | 
											
												
													
														|  | 
 |  | +    # print(score_df)
 | 
											
												
													
														|  | 
 |  | +    return score_df
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +if __name__ == "__main__":
 | 
											
												
													
														|  | 
 |  | +    # 1.load data
 | 
											
												
													
														|  | 
 |  | +    now_date = sys.argv[1]
 | 
											
												
													
														|  | 
 |  | +    print(f"now_date: {now_date}")
 | 
											
												
													
														|  | 
 |  | +    data_path = f"./data/hour_video_data_{now_date}.csv"
 | 
											
												
													
														|  | 
 |  | +    data_df = data_group(data_path=data_path)
 | 
											
												
													
														|  | 
 |  | +    print(f"data_df shape: {data_df.shape}")
 | 
											
												
													
														|  | 
 |  | +    hour_score_path = f"./data/hour_score_{now_date}.csv"
 | 
											
												
													
														|  | 
 |  | +    score_df = cal_score(data_df=data_df)
 | 
											
												
													
														|  | 
 |  | +    score_df.to_csv(hour_score_path, index=False)
 | 
											
												
													
														|  | 
 |  | +    print(f"score_df shape: {score_df.shape}")
 |