12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
# coding: utf-8
- import sys
- import json
- import math
- import pandas as pd
# Column layout of one input CSV row: data_group() reads the value of
# features[i] from position i of each comma-separated line.
features = [
    # Identifier columns (data_group skips these when summing).
    "apptype",
    "videoid",
    # Distinct-user counts over the past 24h.
    "preview人数",  # users pre-exposed (preview) in the past 24h
    "view人数",  # users exposed (view) in the past 24h
    "play人数",  # users who played in the past 24h
    "share人数",  # users who shared in the past 24h
    "回流人数",  # users who returned in the past 24h via shares made in the past 24h
    # Event counts over the past 24h.
    "preview次数",  # number of pre-exposures in the past 24h
    "view次数",  # number of exposures in the past 24h
    "play次数",  # number of plays in the past 24h
    "share次数",  # number of shares in the past 24h
    # Platform-level columns (exact semantics are not documented in this file).
    "platform_return",
    "platform_preview",
    "platform_preview_total",
    "platform_show",
    "platform_show_total",
    "platform_view",
    "platform_view_total",
]
def data_group(data_path):
    """Aggregate the rows of a CSV file by videoid, summing every numeric feature.

    The first line of the file is treated as a header and skipped. Each
    remaining line is split on "," and its fields are matched positionally
    against the module-level ``features`` list. Lines with fewer fields than
    ``features`` are ignored. The 'apptype' and 'videoid' columns are not
    summed; every other column is converted with ``int(float(value))`` and
    accumulated per videoid.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        A pandas DataFrame with one row per videoid. Its columns are
        'videoid' followed by the summed features in ``features`` order.
        When no data row is usable, the frame is empty but still carries
        those columns.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a numeric field cannot be parsed as a number.
    """
    id_columns = ('apptype', 'videoid')
    # (position, name) of every column that gets summed; computed once so the
    # per-line loop does not repeat the identifier check.
    summed_columns = [
        (position, name)
        for position, name in enumerate(features)
        if name not in id_columns
    ]
    min_fields = len(features)

    data_dict = {}
    # The context manager closes the file even when a row fails to parse.
    with open(data_path) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            items = line.strip().split(",")
            if len(items) < min_fields:
                continue
            video_id = items[1]
            totals = data_dict.get(video_id)
            if totals is None:
                totals = data_dict[video_id] = {'videoid': video_id}
            for position, name in summed_columns:
                totals[name] = totals.get(name, 0) + int(float(items[position]))

    # Passing the column list keeps the layout stable for empty input, so that
    # downstream column lookups do not fail on a header-only file.
    columns = ['videoid'] + [name for _, name in summed_columns]
    data_df = pd.DataFrame(list(data_dict.values()), columns=columns)
    return data_df
def cal_score(data_df):
    """Compute the 24h score for every row of ``data_df``.

    The score is ``回流人数 / (view人数 + 10)`` and is stored in a column
    named '24h_score1'. The input frame is not modified; the work is done
    on a copy.

    Args:
        data_df: DataFrame containing at least the columns 'videoid',
            '回流人数' and 'view人数'.

    Returns:
        A DataFrame with exactly the columns 'videoid' and '24h_score1'.
    """
    scored = data_df.copy()
    # score1 = returning users / (viewing users + 10)
    denominator = scored['view人数'] + 10
    scored['24h_score1'] = scored['回流人数'] / denominator
    return scored[['videoid', '24h_score1']]
if __name__ == "__main__":
    # The date suffix of the input and output file names is the first
    # command-line argument. Exit with a usage hint (non-zero status) rather
    # than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <now_date>")

    # 1. Load the per-row 24h data and aggregate it by videoid.
    now_date = sys.argv[1]
    print(f"now_date: {now_date}")
    data_path = f"./data/24h_video_data_{now_date}.csv"
    data_df = data_group(data_path=data_path)
    print(f"data_df shape: {data_df.shape}")

    # 2. Compute the score per videoid and write it next to the input data.
    hour_score_path = f"./data/24h_score_{now_date}.csv"
    score_df = cal_score(data_df=data_df)
    score_df.to_csv(hour_score_path, index=False)
    print(f"score_df shape: {score_df.shape}")
|