123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
- import sys
- import os
- import json
- from tqdm import tqdm
- sys.path.append(os.getcwd())
- from functions.generate_data import generate_label_date
def generate_train_label(item, y_ori_data):
    """
    Build the feature row and the training label for one video record.

    :param item: mapping describing one video; must contain 'video_id', 'dt'
        and every key listed in ``feature_keys`` (a missing key raises KeyError)
    :param y_ori_data: label lookup, indexed first by the date returned from
        ``generate_label_date(dt)`` and then by video id
    :return: ``(label, item_features)`` -- ``label`` is ``int(total_return)``,
        or ``None`` when there is no entry for this video on the label date or
        its 'total_return' value is falsy; ``item_features`` is the list of
        feature values in the fixed order of ``feature_keys``
    """
    vid = item['video_id']
    record_date = item['dt']

    # Fixed column order of the emitted feature row.
    feature_keys = (
        "uid",
        "type",
        "channel",
        "fans",
        "view_count_user_30days",
        "share_count_user_30days",
        "return_count_user_30days",
        "rov_user",
        "str_user",
        "out_user_id",
        "mode",
        "out_play_cnt",
        "out_like_cnt",
        "out_share_cnt",
        "out_collection_cnt",
    )
    feature_row = [item[key] for key in feature_keys]

    # Look up the label entry for this video on the derived label date.
    target_date = generate_label_date(record_date)
    per_date_labels = y_ori_data.get(target_date, {})
    entry = per_date_labels.get(vid)
    if not entry:
        return None, feature_row

    raw_total = entry['total_return']
    if not raw_total:
        # Empty / zero-like values are treated as "no label".
        return None, feature_row
    return int(raw_total), feature_row
if __name__ == '__main__':
    # Script entry point: join the January feature records with their labels
    # and dump the aligned feature rows / labels as two JSON files.
    x_path = 'data/train_january.json'
    y_path = 'data/jan_feb_label.json'

    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(x_path, encoding='utf-8') as f:
        x_data = json.load(f)
    with open(y_path, encoding='utf-8') as f:
        y_data = json.load(f)

    x_list = []
    y_list = []
    for video_obj in tqdm(x_data):
        our_label, features = generate_train_label(video_obj, y_data)
        # NOTE(review): this truthiness test skips records whose label is None
        # and also those whose label is 0 -- confirm that dropping zero labels
        # is intended.
        if our_label:
            x_list.append(features)
            y_list.append(our_label)

    # Make sure the output directory exists so the run does not fail only
    # after all records have been processed.
    os.makedirs("whole_data", exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the files must be
    # opened with an explicit UTF-8 encoding to be written reliably.
    with open("whole_data/x_data.json", "w", encoding='utf-8') as f1:
        json.dump(x_list, f1, ensure_ascii=False)
    with open("whole_data/y_data.json", "w", encoding='utf-8') as f2:
        json.dump(y_list, f2, ensure_ascii=False)
|