import sys
import os
import json

from tqdm import tqdm

# The current working directory is appended to the import path *before* the
# project-local import below (presumably so the script can be launched from
# the project root -- confirm against how it is invoked).
sys.path.append(os.getcwd())

from functions.generate_data import generate_label_date  # noqa: E402

# Keys copied, in this order, from every input record into its feature row.
USEFUL_FEATURES = (
    "uid",
    "type",
    "channel",
    "fans",
    "view_count_user_30days",
    "share_count_user_30days",
    "return_count_user_30days",
    "rov_user",
    "str_user",
    "out_user_id",
    "mode",
    "out_play_cnt",
    "out_like_cnt",
    "out_share_cnt",
    "out_collection_cnt",
)


def generate_train_label(item: dict, y_ori_data: dict) -> tuple:
    """Build the label and the feature row for a single input record.

    Args:
        item: One record. Must contain ``'video_id'``, ``'dt'`` and every key
            listed in ``USEFUL_FEATURES``.
        y_ori_data: Nested mapping ``{label_date: {video_id: label_obj}}``
            where ``label_obj`` carries a ``'total_return'`` entry.

    Returns:
        A ``(label, item_features)`` tuple. ``label`` is
        ``int(total_return)``, or ``None`` when no label object exists for
        the record or its ``total_return`` is falsy. ``item_features`` is a
        list with the record's values for ``USEFUL_FEATURES``, in order.

    Raises:
        KeyError: If ``item`` lacks a required key, or a label object that
            was found has no ``'total_return'`` entry.
    """
    video_id = item['video_id']
    dt = item['dt']
    item_features = [item[name] for name in USEFUL_FEATURES]

    # The label is looked up under the date derived from the record's `dt`
    # by the project helper (its exact date arithmetic is not visible here).
    label_dt = generate_label_date(dt)
    label_obj = y_ori_data.get(label_dt, {}).get(video_id)

    label = None
    if label_obj:
        total_return = label_obj['total_return']
        if total_return:
            label = int(total_return)
    return label, item_features


def main():
    """Join features with labels and dump both lists as JSON files."""
    x_path = 'data/0312-0317_hour_train.json'
    y_path = 'data/12-17-train_label.json'
    x_out_path = "whole_data/x_data.json"
    y_out_path = "whole_data/y_data.json"

    # Read explicitly as UTF-8 instead of relying on the platform default.
    with open(x_path, encoding="utf-8") as f:
        x_data = json.load(f)
    with open(y_path, encoding="utf-8") as f:
        y_data = json.load(f)

    x_list = []
    y_list = []
    for video_obj in tqdm(x_data):
        our_label, features = generate_train_label(video_obj, y_data)
        # Records whose label is falsy (None, or a label equal to 0) are
        # skipped, exactly as before.
        if not our_label:
            continue
        x_list.append(features)
        y_list.append(our_label)
    print(len(y_list))

    # Create the output directory up front so the run does not fail at the
    # very end just because the folder is missing.
    for out_path in (x_out_path, y_out_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the files must be
    # opened as UTF-8; the platform default encoding may not be able to
    # represent them.
    with open(x_out_path, "w", encoding="utf-8") as f1:
        json.dump(x_list, f1, ensure_ascii=False)
    with open(y_out_path, "w", encoding="utf-8") as f2:
        json.dump(y_list, f2, ensure_ascii=False)


if __name__ == '__main__':
    main()