1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- """
- Process the raw feature and label data into the format LightGBM expects.
- """
- import sys
- import os
- import json
- from tqdm import tqdm
- sys.path.append(os.getcwd())
- from functions import generate_label_date
def generate_train_label(item, y_ori_data, cate):
    """
    Build one training sample (label plus feature row) for a single record.

    :param item: mapping that holds 'video_id', 'dt' and every feature key
        listed in ``useful_features``; a missing key raises KeyError.
    :param y_ori_data: nested mapping of label date -> video id -> label
        record, where the label date is whatever ``generate_label_date(dt)``
        returns.
    :param cate: key of the label value to read from the label record.
    :return: ``(label, item_features)`` -- ``label`` is an int (0 when no
        label record, no such category, or a falsy value is found) and
        ``item_features`` is a list of the raw feature values in the fixed
        order of ``useful_features``.
    """
    # The order here defines the column order of every feature row.
    useful_features = (
        "uid",
        "type",
        "channel",
        "fans",
        "view_count_user_30days",
        "share_count_user_30days",
        "return_count_user_30days",
        "rov_user",
        "str_user",
        "out_user_id",
        "mode",
        "out_play_cnt",
        "out_like_cnt",
        "out_share_cnt",
        "out_collection_cnt",
    )
    video_id = item['video_id']
    dt = item['dt']
    item_features = [item[name] for name in useful_features]

    label_dt = generate_label_date(dt)
    label_obj = y_ori_data.get(label_dt, {}).get(video_id)
    # A record without the requested category is treated like a missing
    # record (label 0) instead of raising KeyError.
    raw_label = label_obj.get(cate) if label_obj else None
    label = int(raw_label) if raw_label else 0
    return label, item_features
def main():
    """
    Convert the raw feature/label JSON files into per-category x/y JSON files.

    Reads the feature records and the label data, builds one feature row and
    one label per record with ``generate_train_label`` for each category in
    ``cate_list``, and writes the resulting lists to
    ``whole_data/x_data_<cate>_prid.json`` and
    ``whole_data/y_data_<cate>_prid.json``.
    """
    x_path = 'prid_data/train_0314_0317.json'
    y_path = 'data/daily-label-20240315-20240321.json'
    # Explicit UTF-8: the platform default encoding is locale dependent.
    with open(x_path, encoding="utf-8") as f:
        x_data = json.load(f)
    with open(y_path, encoding="utf-8") as f:
        y_data = json.load(f)

    # Make sure the output directory exists before any file is written.
    os.makedirs("whole_data", exist_ok=True)

    cate_list = ['total_return']
    for c in cate_list:
        x_list = []
        y_list = []
        for video_obj in tqdm(x_data):
            our_label, features = generate_train_label(video_obj, y_data, c)
            x_list.append(features)
            y_list.append(our_label)
        # ensure_ascii=False emits raw non-ASCII characters, so the output
        # files must be opened as UTF-8 to be written reliably.
        with open("whole_data/x_data_{}_prid.json".format(c), "w", encoding="utf-8") as f1:
            json.dump(x_list, f1, ensure_ascii=False)
        with open("whole_data/y_data_{}_prid.json".format(c), "w", encoding="utf-8") as f2:
            json.dump(y_list, f2, ensure_ascii=False)


if __name__ == '__main__':
    main()
|